
Trajectory

The Trajectory, Run, message, and score classes define the data model for captured agent executions.

Trajectory

Trajectory

Bases: BaseModel

A single agent execution trace on an Inspect sample.

Trajectories are grouped into Runs, which provide task and model context. A trajectory represents one sample's execution trace.

Source code in lunette/models/trajectory.py
class Trajectory(BaseModel):
    """A single agent execution trace on an Inspect sample.

    Trajectories are grouped into Runs, which provide task and model context.
    A trajectory represents one sample's execution trace.
    """

    sample: int | str
    """Inspect sample ID - identifies which sample this trajectory is for."""

    messages: list[Message]
    """Sequence of messages (System, User, Assistant, Tool) in this execution."""

    scores: dict[str, ScalarScore] | None = None
    """Multi-metric scores for this trajectory, if available."""

    metadata: dict[str, Any] = Field(default_factory=dict)
    """Additional metadata about this trajectory execution."""

    solution: str | None = None
    """Optional solution or patch produced by the agent."""

    sandbox_id: str | None = None
    """Optional sandbox container ID if this trajectory ran in a sandbox."""

    @computed_field
    @property
    def score(self) -> float | None:
        """Return the unique score value for the trajectory if it exists and `None` otherwise."""
        if self.scores is None or len(self.scores) != 1:
            return None
        [score] = self.scores.values()
        return score.value

    @classmethod
    def from_inspect(cls, sample: EvalSample) -> Trajectory:
        """Convert an Inspect AI `EvalSample` to a `Trajectory`.

        Args:
            sample: The Inspect AI sample to convert

        Returns:
            Trajectory object containing the sample's execution trace
        """

        # fail fast if the sample has an error
        if sample.error:
            raise ValueError(f"Sample {sample.id} has an error: {sample.error.message}")

        # Extract scores - handle both scalar values and dict values (e.g. control_arena)
        scores: dict[str, ScalarScore] | None = None
        if sample.scores is not None:
            scores = {}
            for name, score in sample.scores.items():
                if isinstance(score.value, dict):
                    # Dict score - split into separate entries with composite names
                    for subkey, subvalue in score.value.items():
                        composite_name = f"{name}/{subkey}" if name else subkey
                        scores[composite_name] = ScalarScore(
                            value=_normalize_score_value(subvalue),
                            answer=score.answer,
                            explanation=score.explanation,
                            metadata=score.metadata,
                        )
                else:
                    # Scalar score
                    scores[name] = ScalarScore(
                        value=_normalize_score_value(score.value),
                        answer=score.answer,
                        explanation=score.explanation,
                        metadata=score.metadata,
                    )

        # convert InspectAI `ChatMessage`s to our `Message`s
        messages: list[Message] = []
        tool_calls: dict[str, ToolCall] = {}  # tool call ID -> `ToolCall`

        for position, message in enumerate(sample.messages):
            match message:
                case ChatMessageAssistant():
                    assistant_message = AssistantMessage.from_inspect(position, message)
                    messages.append(assistant_message)
                    if assistant_message.tool_calls is not None:
                        for tool_call in assistant_message.tool_calls:
                            tool_calls[tool_call.id] = tool_call

                case ChatMessageTool():
                    tool_call_id = message.tool_call_id
                    if tool_call_id not in tool_calls:
                        raise ValueError(f"Tool call ID {tool_call_id} not found")
                    tool_message = ToolMessage.from_inspect(
                        position, message, tool_calls[tool_call_id]
                    )
                    messages.append(tool_message)

                case ChatMessageSystem():
                    system_message = SystemMessage.from_inspect(position, message)
                    messages.append(system_message)

                case ChatMessageUser():
                    user_message = UserMessage.from_inspect(position, message)
                    messages.append(user_message)

        # extract solution from metadata if available
        # TODO: make this more general; currently only supports the "patch" key (used in SWE-bench)
        solution: str | None = sample.metadata.get("patch", None)

        # extract sandbox_id from metadata if available
        sandbox_id: str | None = sample.metadata.get("lunette_sandbox_id", None)

        return cls(
            sample=sample.id,
            messages=messages,
            scores=scores,
            metadata=_sanitize_metadata(sample.metadata),
            solution=solution,
            sandbox_id=sandbox_id,
        )
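
For reference, a minimal usage sketch: converting the samples from an Inspect AI eval log into Trajectory objects. This assumes Trajectory is importable from lunette.models.trajectory (the source location shown above); the log path is purely illustrative.

from inspect_ai.log import read_eval_log

from lunette.models.trajectory import Trajectory  # assumed import path

# Read an eval log produced by `inspect eval` (the path is hypothetical)
log = read_eval_log("logs/2024-06-01_swe-bench.eval")

trajectories: list[Trajectory] = []
for sample in log.samples or []:
    # from_inspect raises ValueError on errored samples, so skip them here
    if sample.error:
        continue
    trajectories.append(Trajectory.from_inspect(sample))

print(f"Converted {len(trajectories)} trajectories")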

messages instance-attribute

Sequence of messages (System, User, Assistant, Tool) in this execution.

metadata = Field(default_factory=dict) class-attribute instance-attribute

Additional metadata about this trajectory execution.

sample instance-attribute

Inspect sample ID - identifies which sample this trajectory is for.

sandbox_id = None class-attribute instance-attribute

Optional sandbox container ID if this trajectory ran in a sandbox.

score property

Return the unique score value for the trajectory if it exists and None otherwise; see the sketch after this attribute list.

scores = None class-attribute instance-attribute

Multi-metric scores for this trajectory, if available.

solution = None class-attribute instance-attribute

Optional solution or patch produced by the agent.
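
A small sketch of the score convenience property described above, assuming Trajectory and ScalarScore are importable from lunette.models.trajectory:

from lunette.models.trajectory import ScalarScore, Trajectory  # assumed import path

one_metric = Trajectory(
    sample="sample-1",
    messages=[],
    scores={"accuracy": ScalarScore(value=1.0)},
)
print(one_metric.score)  # 1.0 -- exactly one metric, so its value is returned

two_metrics = Trajectory(
    sample="sample-2",
    messages=[],
    scores={"accuracy": ScalarScore(value=1.0), "side_task": ScalarScore(value=0.0)},
)
print(two_metrics.score)  # None -- ambiguous when there is more than one metric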

from_inspect(sample) classmethod

Convert an Inspect AI EvalSample to a Trajectory.

Parameters:

sample (EvalSample): The Inspect AI sample to convert. Required.

Returns:

Trajectory: Trajectory object containing the sample's execution trace.

Source code in lunette/models/trajectory.py
@classmethod
def from_inspect(cls, sample: EvalSample) -> Trajectory:
    """Convert an Inspect AI `EvalSample` to a `Trajectory`.

    Args:
        sample: The Inspect AI sample to convert

    Returns:
        Trajectory object containing the sample's execution trace
    """

    # fail fast if the sample has an error
    if sample.error:
        raise ValueError(f"Sample {sample.id} has an error: {sample.error.message}")

    # Extract scores - handle both scalar values and dict values (e.g. control_arena)
    scores: dict[str, ScalarScore] | None = None
    if sample.scores is not None:
        scores = {}
        for name, score in sample.scores.items():
            if isinstance(score.value, dict):
                # Dict score - split into separate entries with composite names
                for subkey, subvalue in score.value.items():
                    composite_name = f"{name}/{subkey}" if name else subkey
                    scores[composite_name] = ScalarScore(
                        value=_normalize_score_value(subvalue),
                        answer=score.answer,
                        explanation=score.explanation,
                        metadata=score.metadata,
                    )
            else:
                # Scalar score
                scores[name] = ScalarScore(
                    value=_normalize_score_value(score.value),
                    answer=score.answer,
                    explanation=score.explanation,
                    metadata=score.metadata,
                )

    # convert InspectAI `ChatMessage`s to our `Message`s
    messages: list[Message] = []
    tool_calls: dict[str, ToolCall] = {}  # tool call ID -> `ToolCall`

    for position, message in enumerate(sample.messages):
        match message:
            case ChatMessageAssistant():
                assistant_message = AssistantMessage.from_inspect(position, message)
                messages.append(assistant_message)
                if assistant_message.tool_calls is not None:
                    for tool_call in assistant_message.tool_calls:
                        tool_calls[tool_call.id] = tool_call

            case ChatMessageTool():
                tool_call_id = message.tool_call_id
                if tool_call_id not in tool_calls:
                    raise ValueError(f"Tool call ID {tool_call_id} not found")
                tool_message = ToolMessage.from_inspect(
                    position, message, tool_calls[tool_call_id]
                )
                messages.append(tool_message)

            case ChatMessageSystem():
                system_message = SystemMessage.from_inspect(position, message)
                messages.append(system_message)

            case ChatMessageUser():
                user_message = UserMessage.from_inspect(position, message)
                messages.append(user_message)

    # extract solution from metadata if available
    # TODO: make this more general; currently only supports the "patch" key (used in SWE-bench)
    solution: str | None = sample.metadata.get("patch", None)

    # extract sandbox_id from metadata if available
    sandbox_id: str | None = sample.metadata.get("lunette_sandbox_id", None)

    return cls(
        sample=sample.id,
        messages=messages,
        scores=scores,
        metadata=_sanitize_metadata(sample.metadata),
        solution=solution,
        sandbox_id=sandbox_id,
    )

Run

Run

Bases: BaseModel

A collection of trajectories from a single evaluation run.

This is the primary unit for uploading evaluation results. A run represents a single execution of inspect eval that produces multiple trajectory samples. All trajectories in a run share the same task and model.

Source code in lunette/models/run.py
class Run(BaseModel):
    """A collection of trajectories from a single evaluation run.

    This is the primary unit for uploading evaluation results. A run represents
    a single execution of `inspect eval` that produces multiple trajectory samples.
    All trajectories in a run share the same task and model.
    """

    id: str | None = None
    """Optional server-assigned run ID. If None, server generates a UUID. If provided, appends to existing run."""

    task: str
    """Task name for this run (e.g., 'math-eval', 'swe-bench')."""

    model: str
    """Model identifier used for this run (e.g., 'claude-sonnet-4', 'gpt-4')."""

    trajectories: list[Trajectory]
    """List of trajectory samples produced during this evaluation run."""

id = None class-attribute instance-attribute

Optional server-assigned run ID. If None, server generates a UUID. If provided, appends to existing run.

model instance-attribute

Model identifier used for this run (e.g., 'claude-sonnet-4', 'gpt-4').

task instance-attribute

Task name for this run (e.g., 'math-eval', 'swe-bench').

trajectories instance-attribute

List of trajectory samples produced during this evaluation run.
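
A minimal sketch of assembling a Run for upload, assuming Run is importable from lunette.models.run (the source location shown above) and that trajectories is a list of Trajectory objects, e.g. produced by Trajectory.from_inspect; it is serialized here with pydantic's model_dump_json.

from lunette.models.run import Run  # assumed import path

run = Run(
    task="swe-bench",          # task name shared by every trajectory in the run
    model="claude-sonnet-4",   # model identifier shared by every trajectory
    trajectories=trajectories,
)

# `id` is left as None here, so the server would assign a UUID; passing an
# existing run ID instead would append these trajectories to that run.
payload = run.model_dump_json()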

Messages

SystemMessage

SystemMessage

Bases: BaseMessage

System message.

Source code in lunette/models/messages.py
class SystemMessage(BaseMessage):
    """System message."""

    role: Literal["system"] = "system"

    @classmethod
    def from_inspect(
        cls, position: int, message: InspectSystemMessage
    ) -> SystemMessage:
        """Convert an Inspect AI `ChatMessageSystem` to `SystemMessage`."""
        return cls(position=position, content=_content_from_inspect(message.content))

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageSystem to SystemMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls, position: int, message: InspectSystemMessage
) -> SystemMessage:
    """Convert an Inspect AI `ChatMessageSystem` to `SystemMessage`."""
    return cls(position=position, content=_content_from_inspect(message.content))

UserMessage

UserMessage

Bases: BaseMessage

User message.

Source code in lunette/models/messages.py
class UserMessage(BaseMessage):
    """User message."""

    role: Literal["user"] = "user"

    @classmethod
    def from_inspect(cls, position: int, message: InspectUserMessage) -> UserMessage:
        """Convert an Inspect AI `ChatMessageUser` to `UserMessage`."""
        return cls(position=position, content=_content_from_inspect(message.content))

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageUser to UserMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(cls, position: int, message: InspectUserMessage) -> UserMessage:
    """Convert an Inspect AI `ChatMessageUser` to `UserMessage`."""
    return cls(position=position, content=_content_from_inspect(message.content))

AssistantMessage

AssistantMessage

Bases: BaseMessage

Assistant message.

Source code in lunette/models/messages.py
class AssistantMessage(BaseMessage):
    """Assistant message."""

    role: Literal["assistant"] = "assistant"
    tool_calls: list[ToolCall] | None = None

    @classmethod
    def from_inspect(
        cls, position: int, message: InspectAssistantMessage
    ) -> AssistantMessage:
        """Convert an Inspect AI `ChatMessageAssistant` to `AssistantMessage`."""
        tool_calls = (
            [ToolCall.from_inspect(tool_call) for tool_call in message.tool_calls]
            if message.tool_calls
            else None
        )

        return cls(
            position=position,
            content=_content_from_inspect(message.content),
            tool_calls=tool_calls,
        )

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageAssistant to AssistantMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls, position: int, message: InspectAssistantMessage
) -> AssistantMessage:
    """Convert an Inspect AI `ChatMessageAssistant` to `AssistantMessage`."""
    tool_calls = (
        [ToolCall.from_inspect(tool_call) for tool_call in message.tool_calls]
        if message.tool_calls
        else None
    )

    return cls(
        position=position,
        content=_content_from_inspect(message.content),
        tool_calls=tool_calls,
    )

ToolMessage

ToolMessage

Bases: BaseMessage

Tool message.

The content field contains the result of the tool call.

Source code in lunette/models/messages.py
class ToolMessage(BaseMessage):
    """
    Tool message.

    The `content` field contains the result of the tool call.
    """

    role: Literal["tool"] = "tool"
    tool_call: ToolCall

    @classmethod
    def from_inspect(
        cls,
        position: int,
        message: InspectToolMessage,
        tool_call: ToolCall,
    ) -> ToolMessage:
        """
        Convert an Inspect AI `ChatMessageTool` to `ToolMessage`.

        Args:
            position: Position in the trajectory
            message: The Inspect ChatMessageTool
            tool_call: The matching ToolCall (found by the caller)

        Returns:
            ToolMessage with proper tool_call reference
        """
        return cls(
            position=position,
            content=message.text,
            tool_call=tool_call,
        )

    @property
    def function(self) -> str:
        """Get the function name of this tool call."""
        return self.tool_call.function

    @property
    def arguments(self) -> dict[str, Any]:
        """Get the arguments of this tool call."""
        return self.tool_call.arguments

    @property
    def result(self) -> str:
        """Get the result of this tool call."""
        return self.text

arguments property

Get the arguments of this tool call.

function property

Get the function name of this tool call.

result property

Get the result of this tool call.

from_inspect(position, message, tool_call) classmethod

Convert an Inspect AI ChatMessageTool to ToolMessage.

Parameters:

position (int): Position in the trajectory. Required.

message (ChatMessageTool): The Inspect ChatMessageTool. Required.

tool_call (ToolCall): The matching ToolCall (found by the caller). Required.

Returns:

ToolMessage: ToolMessage with proper tool_call reference.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls,
    position: int,
    message: InspectToolMessage,
    tool_call: ToolCall,
) -> ToolMessage:
    """
    Convert an Inspect AI `ChatMessageTool` to `ToolMessage`.

    Args:
        position: Position in the trajectory
        message: The Inspect ChatMessageTool
        tool_call: The matching ToolCall (found by the caller)

    Returns:
        ToolMessage with proper tool_call reference
    """
    return cls(
        position=position,
        content=message.text,
        tool_call=tool_call,
    )

ToolCall

ToolCall

Bases: BaseModel

A tool call.

Does not include the result of the tool call, as it is not available until a later ToolMessage is received.

Source code in lunette/models/messages.py
class ToolCall(BaseModel):
    """
    A tool call.

    Does not include the result of the tool call, as it is not available until a later `ToolMessage` is received.
    """

    id: str
    function: str
    arguments: dict[str, Any]

    @classmethod
    def from_inspect(cls, tool_call: InspectToolCall) -> ToolCall:
        """Convert an Inspect AI `ToolCall` to our `ToolCall` model."""
        return cls(
            id=tool_call.id,
            function=tool_call.function,
            arguments=tool_call.arguments,
        )

from_inspect(tool_call) classmethod

Convert an Inspect AI ToolCall to our ToolCall model.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(cls, tool_call: InspectToolCall) -> ToolCall:
    """Convert an Inspect AI `ToolCall` to our `ToolCall` model."""
    return cls(
        id=tool_call.id,
        function=tool_call.function,
        arguments=tool_call.arguments,
    )
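
Taken together, a sketch of a tool-call exchange built directly from these message models, assuming they are importable from lunette.models.messages (the source location shown above); the positions, function name, and arguments are illustrative.

from lunette.models.messages import AssistantMessage, ToolCall, ToolMessage  # assumed import path

call = ToolCall(id="call_1", function="bash", arguments={"cmd": "ls /repo"})

assistant = AssistantMessage(
    position=0,
    content="Let me look at the repository contents.",
    tool_calls=[call],
)

tool = ToolMessage(
    position=1,
    content="README.md\nsetup.py",  # the tool's output goes in `content`
    tool_call=call,                 # the same ToolCall the assistant emitted
)

print(tool.function)   # "bash"
print(tool.arguments)  # {"cmd": "ls /repo"}
print(tool.result)     # the tool output text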

Scores

ScalarScore

Bases: BaseModel

A scalar score for a trajectory.

Source code in lunette/models/trajectory.py
class ScalarScore(BaseModel):
    """A scalar score for a trajectory."""

    value: float
    """The value of the score."""

    answer: str | None = None
    """Answer extracted from model output, if available."""

    explanation: str | None = None
    """Explanation of the score, if available."""

    metadata: dict[str, Any] | None = None
    """Additional metadata about the score."""

answer = None class-attribute instance-attribute

Answer extracted from model output, if available.

explanation = None class-attribute instance-attribute

Explanation of the score, if available.

metadata = None class-attribute instance-attribute

Additional metadata about the score.

value instance-attribute

The value of the score.
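
A small sketch of a fully populated ScalarScore, assuming it is importable from lunette.models.trajectory (the source location shown above); the field values are illustrative.

from lunette.models.trajectory import ScalarScore  # assumed import path

score = ScalarScore(
    value=1.0,
    answer="42",
    explanation="Matched the reference answer exactly.",
    metadata={"scorer": "exact_match"},  # illustrative, not a required schema
)

These are the objects stored in the Trajectory.scores mapping, keyed by metric name (or by composite names such as name/subkey when an Inspect score holds a dict of values).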