Source code for tfg.evaluation.scenario_runner

import os
import sys
import io
from contextlib import redirect_stdout

# Append this file's parent directory to sys.path so project modules can be imported when run as a script
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from tfg.System.main import run_conversation
from tfg.evaluation.evaluator import evaluate_response 

# All input/output locations are resolved relative to the directory containing this module.
BASE_DIR = os.path.dirname(__file__)
SCENARIO_FILE = os.path.join(BASE_DIR, "scenarios", "complex_scenarios.txt")
LOG_DIR = os.path.join(BASE_DIR, "logs")
METRIC_DIR = os.path.join(BASE_DIR, "metrics")

# Make sure the output directories exist at import time; existing ones are left as they are.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(METRIC_DIR, exist_ok=True)


def _split_scenario_blocks(content):
    """Split raw file text into blocks of consecutive non-blank lines.

    A line that is empty or contains only whitespace ends the current block.
    Each returned block is a single string with its outer whitespace stripped.
    """
    blocks = []
    current = []
    # The trailing "" sentinel flushes the last block without duplicating the flush logic.
    for line in content.split("\n") + [""]:
        if line.strip():
            current.append(line)
        elif current:
            blocks.append("\n".join(current).strip())
            current = []
    return blocks


def parse_scenarios(file_path):
    """Parse the scenarios file and return a list of scenario dicts.

    The file is read as UTF-8 and split into blocks separated by one or more
    blank (empty or whitespace-only) lines. Within a block:

    * ``# scenario_<...>`` sets the scenario ``id``;
    * ``# difficulty: <value>`` sets ``difficulty`` (default ``"unknown"``);
    * ``# type: <value>`` sets ``type`` (default ``"unspecified"``);
    * ``# requires: a, b`` sets ``requires`` to the list of non-empty,
      comma-separated entries (default ``[]``);
    * any other line starting with ``#`` is ignored;
    * every remaining line is part of the prompt; prompt lines are joined
      with single spaces.

    Args:
        file_path: Path of the scenarios text file.

    Returns:
        A list of dicts with the keys ``id``, ``difficulty``, ``type``,
        ``requires`` and ``prompt``, in file order. Blocks lacking either an
        id or a non-empty prompt are skipped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    scenarios = []

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    for block in _split_scenario_blocks(content):
        scenario = {
            "id": None,
            "difficulty": "unknown",
            "type": "unspecified",
            "requires": [],
            "prompt": "",
        }
        prompt_lines = []

        for line in block.split("\n"):
            if line.startswith("# scenario_"):
                scenario["id"] = line.strip().replace("# ", "")
            elif line.startswith("# difficulty:"):
                scenario["difficulty"] = line.split(":", 1)[1].strip()
            elif line.startswith("# type:"):
                scenario["type"] = line.split(":", 1)[1].strip()
            elif line.startswith("# requires:"):
                # Drop empty entries so "# requires:" or a trailing comma
                # does not produce "" as a requirement.
                entries = (r.strip() for r in line.split(":", 1)[1].split(","))
                scenario["requires"] = [r for r in entries if r]
            elif not line.startswith("#"):
                prompt_lines.append(line)

        scenario["prompt"] = " ".join(prompt_lines).strip()

        # Only complete scenarios (identified and with a prompt) are runnable.
        if scenario["id"] and scenario["prompt"]:
            scenarios.append(scenario)

    return scenarios
def run_and_log(scenario):
    """Run one scenario through the conversation system and write its log and metrics.

    Two files are produced, both named after ``scenario["id"]``:

    * ``<LOG_DIR>/<id>.log`` with the prompt, everything printed to stdout
      while the conversation ran, every returned message, and the final
      assistant response;
    * ``<METRIC_DIR>/<id>_metrics.txt`` with one ``key: value`` line per
      metric returned by ``evaluate_response``.

    Args:
        scenario: Mapping with at least the keys ``"id"``, ``"difficulty"``
            and ``"prompt"``.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"\n▶️ Running {scenario['id']} [{scenario['difficulty']}]")

    # Capture whatever run_conversation prints so it can be stored in the log
    # instead of cluttering the console.
    stdout_buffer = io.StringIO()
    with redirect_stdout(stdout_buffer):
        result = run_conversation(scenario["prompt"], return_full=True)
    debug_output = stdout_buffer.getvalue()

    # Save log file
    log_path = os.path.join(LOG_DIR, f"{scenario['id']}.log")
    with open(log_path, "w", encoding="utf-8") as f:
        f.write("=== PROMPT ===\n")
        f.write(scenario["prompt"] + "\n\n")

        f.write("=== DEBUG OUTPUT ===\n")
        f.write(debug_output + "\n")

        f.write("=== MESSAGES ===\n")
        for msg in result["messages"]:
            msg_type = msg.__class__.__name__
            content = getattr(msg, "content", "")
            f.write(f"{msg_type}: {content}\n")

        # The final answer is the content of the most recent message whose
        # class is named "AIMessage"; a placeholder is logged if there is none.
        assistant_response = next(
            (
                m.content
                for m in reversed(result["messages"])
                if m.__class__.__name__ == "AIMessage"
            ),
            "NO ASSISTANT RESPONSE",
        )
        f.write("\n=== FINAL RESPONSE (from last AIMessage) ===\n")
        # Format via f-string (as in the loop above) rather than concatenating:
        # NOTE(review): message content is presumably not guaranteed to be a
        # str (e.g. a list of content blocks); "+" would raise TypeError then.
        f.write(f"{assistant_response}\n")

    # Evaluate metrics
    metrics = evaluate_response(result, scenario["prompt"])
    metric_path = os.path.join(METRIC_DIR, f"{scenario['id']}_metrics.txt")
    with open(metric_path, "w", encoding="utf-8") as f:
        for k, v in metrics.items():
            f.write(f"{k}: {v}\n")

    print(f"✅ Done: Logs → {log_path}, Metrics → {metric_path}")
def run_all():
    """Print the scenario file path, then run and log each parsed scenario in turn."""
    print("Path:", SCENARIO_FILE)
    for parsed in parse_scenarios(SCENARIO_FILE):
        run_and_log(parsed)
if __name__ == "__main__":
    # Run every scenario when this file is executed directly as a script.
    run_all()