Source code for tfg.evaluation.scenario_runner

import os
import sys
import io
from contextlib import redirect_stdout

# Add the root path of your modules
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from tfg.System.main import run_conversation
from tfg.evaluation.evaluator import evaluate_response 

BASE_DIR = os.path.dirname(__file__)
SCENARIO_FILE = os.path.join(os.path.dirname(__file__), "scenarios", "complex_scenarios.txt")
LOG_DIR = os.path.join(os.path.dirname(__file__), "logs")
METRIC_DIR = os.path.join(os.path.dirname(__file__), "metrics")

os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(METRIC_DIR, exist_ok=True)



[docs]
def parse_scenarios(file_path):
    """Parses the scenarios file and returns a list of dicts with metadata and prompt"""
    scenarios = []
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Split scenarios using double newlines
    raw_scenarios = content.strip().split("\n\n")
    for block in raw_scenarios:
        lines = block.strip().split("\n")
        scenario = {
            "id": None,
            "difficulty": "unknown",
            "type": "unspecified",
            "requires": [],
            "prompt": "",
        }
        prompt_lines = []
        for line in lines:
            if line.startswith("# scenario_"):
                scenario["id"] = line.strip().replace("# ", "")
            elif line.startswith("# difficulty:"):
                scenario["difficulty"] = line.split(":", 1)[1].strip()
            elif line.startswith("# type:"):
                scenario["type"] = line.split(":", 1)[1].strip()
            elif line.startswith("# requires:"):
                scenario["requires"] = [r.strip() for r in line.split(":", 1)[1].split(",")]
            elif not line.startswith("#"):
                prompt_lines.append(line)
        scenario["prompt"] = " ".join(prompt_lines).strip()
        if scenario["id"] and scenario["prompt"]:
            scenarios.append(scenario)
    return scenarios




[docs]
def run_and_log(scenario):
    """Runs the conversation and logs the output and metrics"""
    print(f"\n▶️ Running {scenario['id']} [{scenario['difficulty']}]")

    # Redirect stdout to capture printed messages (Thought, Action, etc.)
    stdout_buffer = io.StringIO()
    with redirect_stdout(stdout_buffer):
        result = run_conversation(scenario["prompt"], return_full=True)
    debug_output = stdout_buffer.getvalue()

    # Save log file
    log_path = os.path.join(LOG_DIR, f"{scenario['id']}.log")
    with open(log_path, "w", encoding="utf-8") as f:
        f.write("=== PROMPT ===\n")
        f.write(scenario["prompt"] + "\n\n")

        f.write("=== DEBUG OUTPUT ===\n")
        f.write(debug_output + "\n")

        f.write("=== MESSAGES ===\n")
        for msg in result["messages"]:
            msg_type = msg.__class__.__name__
            content = getattr(msg, "content", "")
            f.write(f"{msg_type}: {content}\n")

        assistant_response = next(
            (m.content for m in reversed(result["messages"]) if m.__class__.__name__ == "AIMessage"),
            "NO ASSISTANT RESPONSE"
        )
        f.write("\n=== FINAL RESPONSE (from last AIMessage) ===\n")
        f.write(assistant_response + "\n")

    # Evaluate metrics
    metrics = evaluate_response(result, scenario["prompt"])
    metric_path = os.path.join(METRIC_DIR, f"{scenario['id']}_metrics.txt")
    with open(metric_path, "w", encoding="utf-8") as f:
        for k, v in metrics.items():
            f.write(f"{k}: {v}\n")

    print(f"✅ Done: Logs → {log_path}, Metrics → {metric_path}")





[docs]
def run_all():
    """Main loop: parse scenarios and run each"""
    print("Path:", SCENARIO_FILE)
    scenarios = parse_scenarios(SCENARIO_FILE)
    for scenario in scenarios:
        run_and_log(scenario)



if __name__ == "__main__":
    run_all()