from typing import Dict

from maxim.evaluators import BaseEvaluator
from maxim.models import (
    LocalEvaluatorResultParameter,
    LocalEvaluatorReturn,
    LocalData,
    PassFailCriteria,
)
from maxim.models.evaluator import (
    PassFailCriteriaOnEachEntry,
    PassFailCriteriaForTestrunOverall,
)


class SimulationOutputsEvaluator(BaseEvaluator):
    """Checks that the simulation produced at least as many steps as expected."""

    def evaluate(
        self, result: LocalEvaluatorResultParameter, data: LocalData
    ) -> Dict[str, LocalEvaluatorReturn]:
        # Fail immediately if the simulation produced no output steps.
        if not result.simulation_outputs:
            return {
                "simulation-steps-validator": LocalEvaluatorReturn(
                    score=0, reasoning="No simulation outputs available"
                ),
            }

        # "Expected Steps" is a newline-separated list in the dataset entry;
        # count only non-empty lines.
        expected_lines = [
            line
            for line in data.get("Expected Steps", "").split("\n")
            if line.strip()
        ]
        expected_steps_count = len(expected_lines)
        actual_steps_count = len(result.simulation_outputs)

        # Pass when the simulation took at least as many steps as expected.
        steps_match = actual_steps_count >= expected_steps_count
        return {
            "simulation-steps-validator": LocalEvaluatorReturn(
                score=1 if steps_match else 0,
                reasoning=f"Simulation produced {actual_steps_count} steps, "
                f"expected {expected_steps_count}",
            ),
        }


def simulation_outputs_evaluator() -> SimulationOutputsEvaluator:
    # Each entry passes when its score is >= 1; the overall test run passes
    # only if 100% of entries pass.
    return SimulationOutputsEvaluator(
        pass_fail_criteria={
            "simulation-steps-validator": PassFailCriteria(
                on_each_entry_pass_if=PassFailCriteriaOnEachEntry(
                    score_should_be=">=", value=1
                ),
                for_testrun_overall_pass_if=PassFailCriteriaForTestrunOverall(
                    overall_should_be=">=",
                    value=100,
                    for_result="percentageOfPassedResults",
                ),
            ),
        }
    )
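

# --- Minimal setup sketch (assumptions, not from the snippet above) ---
# The test run below needs a configured Maxim client plus a dataset whose
# columns include "Expected Steps", the column the evaluator reads. Everything
# here is a placeholder: the client construction, the column names other than
# "Expected Steps", and the sample row are illustrative assumptions, and
# SimulationConfig is assumed to be importable from your Maxim SDK version.
from maxim import Maxim

maxim = Maxim()  # assumes MAXIM_API_KEY is set in the environment

workspace_id = "YOUR_WORKSPACE_ID"  # placeholder
prompt_version_id = "YOUR_PROMPT_VERSION_ID"  # placeholder

# Column names mapped to Maxim column types; "Expected Steps" is the one the
# evaluator reads, the rest are illustrative.
data_structure = {
    "Input": "INPUT",
    "Expected Steps": "EXPECTED_OUTPUT",
}

manual_data = [
    {
        "Input": "Book a table for two tomorrow at 7pm",
        "Expected Steps": "greet the user\ncollect booking details\nconfirm the reservation",
    },
]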

# Wire the evaluator into a simulated test run. `SimulationConfig` and the
# identifiers below come from your Maxim setup (see the sketch above).
result = (
    maxim.create_test_run("Simulation Test", workspace_id)
    .with_data_structure(data_structure)
    .with_simulation_config(SimulationConfig(max_turns=6))
    .with_prompt_version_id(prompt_version_id)  # or .with_workflow_id(workflow_id)
    .with_data(manual_data)
    .with_evaluators(simulation_outputs_evaluator())
    .run()
)