Evaluators API Reference

ContainsEvaluator

Check that the reply contains expected substrings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `any_of` | `list[str]` | Reply must contain at least one of these strings (case-insensitive). | `list()` |
| `all_of` | `list[str]` | Reply must contain every one of these strings (case-insensitive). | `list()` |
Example
ContainsEvaluator(any_of=["confirmed", "booked"])
ContainsEvaluator(all_of=["booking", "reference number"])
Source code in src/pytest_agent_eval/evaluators/contains.py
@dataclass
class ContainsEvaluator:
    """Check that the reply contains expected substrings.

    Args:
        any_of: Reply must contain at least one of these strings (case-insensitive).
        all_of: Reply must contain every one of these strings (case-insensitive).

    Example:
        ```python
        ContainsEvaluator(any_of=["confirmed", "booked"])
        ContainsEvaluator(all_of=["booking", "reference number"])
        ```
    """

    any_of: list[str] = field(default_factory=list)
    all_of: list[str] = field(default_factory=list)

    async def evaluate(self, ctx: TurnContext) -> EvalResult:
        """Evaluate substring presence in the reply."""
        lowered = ctx.reply.lower()

        if self.any_of and not any(s.lower() in lowered for s in self.any_of):
            return EvalResult(
                passed=False,
                reasoning=f"Reply did not contain any of {self.any_of!r}",
            )

        missing = [s for s in self.all_of if s.lower() not in lowered]
        if missing:
            return EvalResult(
                passed=False,
                reasoning=f"Reply missing required strings: {missing!r}",
            )

        return EvalResult(passed=True, reasoning="All substring checks passed")

evaluate(ctx: TurnContext) -> EvalResult async

Evaluate substring presence in the reply.

Source code in src/pytest_agent_eval/evaluators/contains.py
async def evaluate(self, ctx: TurnContext) -> EvalResult:
    """Evaluate substring presence in the reply."""
    lowered = ctx.reply.lower()

    if self.any_of and not any(s.lower() in lowered for s in self.any_of):
        return EvalResult(
            passed=False,
            reasoning=f"Reply did not contain any of {self.any_of!r}",
        )

    missing = [s for s in self.all_of if s.lower() not in lowered]
    if missing:
        return EvalResult(
            passed=False,
            reasoning=f"Reply missing required strings: {missing!r}",
        )

    return EvalResult(passed=True, reasoning="All substring checks passed")
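
The checks above can also be exercised directly, outside any test runner, by awaiting `evaluate` on a turn context. A minimal sketch, assuming `TurnContext` is importable from `pytest_agent_eval.context` and can be constructed with `reply` and `tool_calls` keyword arguments (only the attribute access shown in the source above is confirmed):

```python
import asyncio

from pytest_agent_eval.evaluators.contains import ContainsEvaluator
# Assumed import path and constructor; the source above only shows
# ctx.reply (str) and ctx.tool_calls (list of tool names) being read.
from pytest_agent_eval.context import TurnContext

async def main() -> None:
    evaluator = ContainsEvaluator(any_of=["confirmed", "booked"])
    ctx = TurnContext(reply="Your table is booked for 7pm.", tool_calls=[])
    result = await evaluator.evaluate(ctx)
    assert result.passed, result.reasoning

asyncio.run(main())
```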

ToolCallEvaluator

Validate that specific tools were (or were not) called.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `must_include` | `list[str]` | Tool names that must appear in `tool_calls`. | `list()` |
| `must_exclude` | `list[str]` | Tool names that must NOT appear in `tool_calls`. | `list()` |
| `ordered` | `bool` | If `True`, `must_include` tools must appear in the given order. | `False` |
Example
ToolCallEvaluator(must_include=["book_slot"], must_exclude=["cancel_slot"])
ToolCallEvaluator(must_include=["auth", "fetch", "respond"], ordered=True)
Source code in src/pytest_agent_eval/evaluators/tool_call.py
@dataclass
class ToolCallEvaluator:
    """Validate that specific tools were (or were not) called.

    Args:
        must_include: Tool names that must appear in tool_calls.
        must_exclude: Tool names that must NOT appear in tool_calls.
        ordered: If True, must_include tools must appear in the given order.

    Example:
        ```python
        ToolCallEvaluator(must_include=["book_slot"], must_exclude=["cancel_slot"])
        ToolCallEvaluator(must_include=["auth", "fetch", "respond"], ordered=True)
        ```
    """

    must_include: list[str] = field(default_factory=list)
    must_exclude: list[str] = field(default_factory=list)
    ordered: bool = False

    async def evaluate(self, ctx: TurnContext) -> EvalResult:
        """Evaluate tool call presence and ordering."""
        failures: list[str] = []

        if not self.ordered:
            for tool in self.must_include:
                if tool not in ctx.tool_calls:
                    failures.append(f"Expected tool {tool!r} not in {ctx.tool_calls!r}")

        for tool in self.must_exclude:
            if tool in ctx.tool_calls:
                failures.append(f"Forbidden tool {tool!r} was called")

        if self.ordered and self.must_include:
            if not _is_ordered_subsequence(self.must_include, ctx.tool_calls):
                failures.append(f"Tools {self.must_include!r} not called in order in {ctx.tool_calls!r}")

        if failures:
            return EvalResult(passed=False, reasoning="\n".join(failures))
        return EvalResult(passed=True, reasoning="All tool call checks passed")

evaluate(ctx: TurnContext) -> EvalResult async

Evaluate tool call presence and ordering.

Source code in src/pytest_agent_eval/evaluators/tool_call.py
async def evaluate(self, ctx: TurnContext) -> EvalResult:
    """Evaluate tool call presence and ordering."""
    failures: list[str] = []

    if not self.ordered:
        for tool in self.must_include:
            if tool not in ctx.tool_calls:
                failures.append(f"Expected tool {tool!r} not in {ctx.tool_calls!r}")

    for tool in self.must_exclude:
        if tool in ctx.tool_calls:
            failures.append(f"Forbidden tool {tool!r} was called")

    if self.ordered and self.must_include:
        if not _is_ordered_subsequence(self.must_include, ctx.tool_calls):
            failures.append(f"Tools {self.must_include!r} not called in order in {ctx.tool_calls!r}")

    if failures:
        return EvalResult(passed=False, reasoning="\n".join(failures))
    return EvalResult(passed=True, reasoning="All tool call checks passed")
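
The ordering check delegates to `_is_ordered_subsequence`, which this page references but does not render. A minimal sketch of such a helper, assuming the usual subsequence semantics (each expected tool must appear after the previous match, with unrelated calls allowed in between):

```python
def _is_ordered_subsequence(expected: list[str], actual: list[str]) -> bool:
    """Return True if `expected` occurs within `actual` in order, gaps allowed."""
    it = iter(actual)
    # `tool in it` advances the iterator until it finds `tool` (or exhausts it),
    # so each expected tool must be found after the previous one's position.
    return all(tool in it for tool in expected)
```

Under this reading, `ToolCallEvaluator(must_include=["auth", "fetch"], ordered=True)` passes for calls `["auth", "log", "fetch"]` but fails for `["fetch", "auth"]`.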

JudgeEvaluator

Use an LLM to evaluate the reply against a rubric.

Uses pydantic-ai under the hood; supports any pydantic-ai compatible model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `rubric` | `str` | Natural language rubric describing what a passing reply looks like. | required |
| `model` | `str \| None` | pydantic-ai model string (e.g. `"openai:gpt-4o"`). Falls back to `[tool.agent_eval] model` in `pyproject.toml` if `None`. | `None` |
| `retries` | `int` | Number of retry attempts on API failure before returning a FAIL verdict. | `2` |
| `timeout` | `float` | Seconds before the judge call times out. | `30.0` |
Example
JudgeEvaluator(
    rubric="Reply must confirm booking with date and time",
    model="anthropic:claude-3-5-sonnet-latest",
)
Source code in src/pytest_agent_eval/evaluators/judge.py
@dataclass
class JudgeEvaluator:
    """Use an LLM to evaluate the reply against a rubric.

    Uses pydantic-ai under the hood; supports any pydantic-ai compatible model.

    Args:
        rubric: Natural language rubric describing what a passing reply looks like.
        model: pydantic-ai model string (e.g. ``"openai:gpt-4o"``). Falls back to
            ``[tool.agent_eval] model`` in pyproject.toml if None.
        retries: Number of retry attempts on API failure before returning a FAIL verdict.
        timeout: Seconds before the judge call times out.

    Example:
        ```python
        JudgeEvaluator(
            rubric="Reply must confirm booking with date and time",
            model="anthropic:claude-3-5-sonnet-latest",
        )
        ```
    """

    rubric: str
    model: str | None = None
    retries: int = 2
    timeout: float = 30.0
    _agent: "Agent[None, _JudgeOutput] | None" = field(default=None, init=False, repr=False)

    def _get_agent(self) -> "Agent[None, _JudgeOutput]":
        if self._agent is None:
            from pytest_agent_eval.config import load_config_from_toml

            model_id = self.model or load_config_from_toml(Path("pyproject.toml")).model
            self._agent = Agent(model_id, output_type=_JudgeOutput, system_prompt=_SYSTEM_PROMPT)
        return self._agent

    async def evaluate(self, ctx: TurnContext) -> EvalResult:
        """Run the LLM judge against the turn and return its verdict."""
        agent = self._get_agent()
        user_msg = _format_judge_prompt(self.rubric, ctx)

        last_error: Exception | None = None
        for _ in range(self.retries + 1):
            try:
                result = await asyncio.wait_for(agent.run(user_msg), timeout=self.timeout)
                output = result.output
                return EvalResult(passed=output.passed, reasoning=output.reasoning)
            except Exception as exc:
                last_error = exc

        return EvalResult(
            passed=False,
            reasoning=f"Judge failed after {self.retries + 1} attempts: {last_error}",
        )
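
When `model` is `None`, `_get_agent` resolves the model id from `load_config_from_toml`, whose implementation is not shown on this page. A rough sketch of what that fallback could look like, assuming a `[tool.agent_eval]` table in `pyproject.toml` with a `model` key (the config class name, its other fields, and its defaults are assumptions):

```python
import tomllib
from dataclasses import dataclass
from pathlib import Path

# The pyproject.toml table this fallback reads, e.g.:
#
#   [tool.agent_eval]
#   model = "openai:gpt-4o"

@dataclass
class AgentEvalConfig:
    # Only the `model` attribute is used by JudgeEvaluator._get_agent above.
    model: str | None = None

def load_config_from_toml(path: Path) -> AgentEvalConfig:
    """Read [tool.agent_eval] from a pyproject.toml file (sketch)."""
    with path.open("rb") as f:
        data = tomllib.load(f)
    section = data.get("tool", {}).get("agent_eval", {})
    return AgentEvalConfig(model=section.get("model"))
```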

evaluate(ctx: TurnContext) -> EvalResult async

Run the LLM judge against the turn and return its verdict.

Source code in src/pytest_agent_eval/evaluators/judge.py
async def evaluate(self, ctx: TurnContext) -> EvalResult:
    """Run the LLM judge against the turn and return its verdict."""
    agent = self._get_agent()
    user_msg = _format_judge_prompt(self.rubric, ctx)

    last_error: Exception | None = None
    for _ in range(self.retries + 1):
        try:
            result = await asyncio.wait_for(agent.run(user_msg), timeout=self.timeout)
            output = result.output
            return EvalResult(passed=output.passed, reasoning=output.reasoning)
        except Exception as exc:
            last_error = exc

    return EvalResult(
        passed=False,
        reasoning=f"Judge failed after {self.retries + 1} attempts: {last_error}",
    )
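
The structured output type `_JudgeOutput` and the prompt builder `_format_judge_prompt` are referenced above but not rendered on this page. A sketch of plausible shapes, assuming a pydantic model with exactly the two fields the code reads and a prompt that inlines the rubric, the reply, and the tool calls (the exact wording and any additional fields are assumptions):

```python
from pydantic import BaseModel

class _JudgeOutput(BaseModel):
    """Structured verdict the judge model is asked to return."""
    passed: bool      # the two attributes JudgeEvaluator.evaluate reads
    reasoning: str

def _format_judge_prompt(rubric: str, ctx: "TurnContext") -> str:
    """Assemble the user message sent to the judge (sketch)."""
    return (
        f"Rubric:\n{rubric}\n\n"
        f"Agent reply:\n{ctx.reply}\n\n"
        f"Tools called: {', '.join(ctx.tool_calls) or 'none'}\n\n"
        "Decide whether the reply satisfies the rubric."
    )
```

Note that the retry loop above retries immediately, with no backoff between attempts, so a persistently failing judge call costs at most `retries + 1` sequential attempts before the FAIL verdict is returned.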