
Trajectory

The Trajectory, Run, message, and score classes define the data model for captured agent executions.

Trajectory

Trajectory

Bases: BaseModel

A single agent execution trace on an Inspect sample.

Trajectories are grouped into Runs, which provide task and model context. A trajectory represents one sample's execution trace.

Source code in lunette/models/trajectory.py
class Trajectory(BaseModel):
    """A single agent execution trace on an Inspect sample.

    Trajectories are grouped into Runs, which provide task and model context.
    A trajectory represents one sample's execution trace.
    """

    sample: int | str
    """Inspect sample ID - identifies which sample this trajectory is for."""

    messages: list[Message]
    """Sequence of messages (System, User, Assistant, Tool) in this execution."""

    scores: dict[str, ScalarScore] | None = None
    """Multi-metric scores for this trajectory, if available."""

    metadata: dict[str, Any] = Field(default_factory=dict)
    """Additional metadata about this trajectory execution."""

    solution: str | None = None
    """Optional solution or patch produced by the agent."""

    sandbox_id: str | None = None
    """Optional sandbox container ID if this trajectory ran in a sandbox."""

    @computed_field
    @property
    def score(self) -> float | None:
        """Return the unique score value for the trajectory if it exists and `None` otherwise."""
        if self.scores is None or len(self.scores) != 1:
            return None
        [score] = self.scores.values()
        return score.value

    @classmethod
    def from_inspect(cls, sample: EvalSample) -> Trajectory:
        """Convert an Inspect AI `EvalSample` to a `Trajectory`.

        Args:
            sample: The Inspect AI sample to convert

        Returns:
            Trajectory object containing the sample's execution trace
        """

        # fail fast if the sample has an error
        if sample.error:
            raise ValueError(f"Sample {sample.id} has an error: {sample.error.message}")

        # Extract scores - handle both scalar values and dict values (e.g. control_arena)
        scores: dict[str, ScalarScore] | None = None
        if sample.scores is not None:
            scores = {}
            for name, score in sample.scores.items():
                if isinstance(score.value, dict):
                    # Dict score - split into separate entries with composite names
                    for subkey, subvalue in score.value.items():
                        composite_name = f"{name}/{subkey}" if name else subkey
                        scores[composite_name] = ScalarScore(
                            value=_normalize_score_value(subvalue),
                            answer=score.answer,
                            explanation=score.explanation,
                            metadata=score.metadata,
                        )
                else:
                    # Scalar score
                    scores[name] = ScalarScore(
                        value=_normalize_score_value(score.value),
                        answer=score.answer,
                        explanation=score.explanation,
                        metadata=score.metadata,
                    )

        # convert InspectAI `ChatMessage`s to our `Message`s
        messages: list[Message] = []
        tool_calls: dict[str, ToolCall] = {}  # tool call ID -> `ToolCall`

        for position, message in enumerate(sample.messages):
            match message:
                case ChatMessageAssistant():
                    assistant_message = AssistantMessage.from_inspect(position, message)
                    messages.append(assistant_message)
                    if assistant_message.tool_calls is not None:
                        for tool_call in assistant_message.tool_calls:
                            tool_calls[tool_call.id] = tool_call

                case ChatMessageTool():
                    tool_call_id = message.tool_call_id
                    if tool_call_id not in tool_calls:
                        raise ValueError(f"Tool call ID {tool_call_id} not found")
                    tool_message = ToolMessage.from_inspect(
                        position, message, tool_calls[tool_call_id]
                    )
                    messages.append(tool_message)

                case ChatMessageSystem():
                    system_message = SystemMessage.from_inspect(position, message)
                    messages.append(system_message)

                case ChatMessageUser():
                    user_message = UserMessage.from_inspect(position, message)
                    messages.append(user_message)

        # extract solution from metadata if available
        # TODO: make this more general; currently only supports the "patch" key (used in SWE-bench)
        solution: str | None = sample.metadata.get("patch", None)

        # extract sandbox_id from metadata if available
        sandbox_id: str | None = sample.metadata.get("lunette_sandbox_id", None)

        return cls(
            sample=sample.id,
            messages=messages,
            scores=scores,
            metadata=_sanitize_metadata(sample.metadata),
            solution=solution,
            sandbox_id=sandbox_id,
        )
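
For reference, a minimal usage sketch: converting the samples from an Inspect AI eval log into Trajectory objects. This assumes Trajectory is importable from lunette.models.trajectory (the source location shown above); the log path is purely illustrative.

from inspect_ai.log import read_eval_log

from lunette.models.trajectory import Trajectory  # assumed import path

# Read an eval log produced by `inspect eval` (the path is hypothetical)
log = read_eval_log("logs/2024-06-01_swe-bench.eval")

trajectories: list[Trajectory] = []
for sample in log.samples or []:
    # from_inspect raises ValueError on errored samples, so skip them here
    if sample.error:
        continue
    trajectories.append(Trajectory.from_inspect(sample))

print(f"Converted {len(trajectories)} trajectories")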

messages instance-attribute

Sequence of messages (System, User, Assistant, Tool) in this execution.

metadata = Field(default_factory=dict) class-attribute instance-attribute

Additional metadata about this trajectory execution.

sample instance-attribute

Inspect sample ID - identifies which sample this trajectory is for.

sandbox_id = None class-attribute instance-attribute

Optional sandbox container ID if this trajectory ran in a sandbox.

score property

Return the unique score value for the trajectory if it exists and None otherwise; see the sketch after this attribute list.

scores = None class-attribute instance-attribute

Multi-metric scores for this trajectory, if available.

solution = None class-attribute instance-attribute

Optional solution or patch produced by the agent.
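
A small sketch of the score convenience property described above, assuming Trajectory and ScalarScore are importable from lunette.models.trajectory:

from lunette.models.trajectory import ScalarScore, Trajectory  # assumed import path

one_metric = Trajectory(
    sample="sample-1",
    messages=[],
    scores={"accuracy": ScalarScore(value=1.0)},
)
print(one_metric.score)  # 1.0 -- exactly one metric, so its value is returned

two_metrics = Trajectory(
    sample="sample-2",
    messages=[],
    scores={"accuracy": ScalarScore(value=1.0), "side_task": ScalarScore(value=0.0)},
)
print(two_metrics.score)  # None -- ambiguous when there is more than one metric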

from_inspect(sample) classmethod

Convert an Inspect AI EvalSample to a Trajectory.

Parameters:

sample (EvalSample): The Inspect AI sample to convert. Required.

Returns:

Trajectory: Trajectory object containing the sample's execution trace.

Source code in lunette/models/trajectory.py
@classmethod
def from_inspect(cls, sample: EvalSample) -> Trajectory:
    """Convert an Inspect AI `EvalSample` to a `Trajectory`.

    Args:
        sample: The Inspect AI sample to convert

    Returns:
        Trajectory object containing the sample's execution trace
    """

    # fail fast if the sample has an error
    if sample.error:
        raise ValueError(f"Sample {sample.id} has an error: {sample.error.message}")

    # Extract scores - handle both scalar values and dict values (e.g. control_arena)
    scores: dict[str, ScalarScore] | None = None
    if sample.scores is not None:
        scores = {}
        for name, score in sample.scores.items():
            if isinstance(score.value, dict):
                # Dict score - split into separate entries with composite names
                for subkey, subvalue in score.value.items():
                    composite_name = f"{name}/{subkey}" if name else subkey
                    scores[composite_name] = ScalarScore(
                        value=_normalize_score_value(subvalue),
                        answer=score.answer,
                        explanation=score.explanation,
                        metadata=score.metadata,
                    )
            else:
                # Scalar score
                scores[name] = ScalarScore(
                    value=_normalize_score_value(score.value),
                    answer=score.answer,
                    explanation=score.explanation,
                    metadata=score.metadata,
                )

    # convert InspectAI `ChatMessage`s to our `Message`s
    messages: list[Message] = []
    tool_calls: dict[str, ToolCall] = {}  # tool call ID -> `ToolCall`

    for position, message in enumerate(sample.messages):
        match message:
            case ChatMessageAssistant():
                assistant_message = AssistantMessage.from_inspect(position, message)
                messages.append(assistant_message)
                if assistant_message.tool_calls is not None:
                    for tool_call in assistant_message.tool_calls:
                        tool_calls[tool_call.id] = tool_call

            case ChatMessageTool():
                tool_call_id = message.tool_call_id
                if tool_call_id not in tool_calls:
                    raise ValueError(f"Tool call ID {tool_call_id} not found")
                tool_message = ToolMessage.from_inspect(
                    position, message, tool_calls[tool_call_id]
                )
                messages.append(tool_message)

            case ChatMessageSystem():
                system_message = SystemMessage.from_inspect(position, message)
                messages.append(system_message)

            case ChatMessageUser():
                user_message = UserMessage.from_inspect(position, message)
                messages.append(user_message)

    # extract solution from metadata if available
    # TODO: make this more general; currently only supports the "patch" key (used in SWE-bench)
    solution: str | None = sample.metadata.get("patch", None)

    # extract sandbox_id from metadata if available
    sandbox_id: str | None = sample.metadata.get("lunette_sandbox_id", None)

    return cls(
        sample=sample.id,
        messages=messages,
        scores=scores,
        metadata=_sanitize_metadata(sample.metadata),
        solution=solution,
        sandbox_id=sandbox_id,
    )

Run

Run

Bases: BaseModel

A collection of trajectories from a single evaluation run.

This is the primary unit for uploading evaluation results. A run represents a single execution of inspect eval that produces multiple trajectory samples. All trajectories in a run share the same task and model.

Source code in lunette/models/run.py
class Run(BaseModel):
    """A collection of trajectories from a single evaluation run.

    This is the primary unit for uploading evaluation results. A run represents
    a single execution of `inspect eval` that produces multiple trajectory samples.
    All trajectories in a run share the same task and model.
    """

    id: str | None = None
    """Optional server-assigned run ID. If None, server generates a UUID. If provided, appends to existing run."""

    task: str
    """Task name for this run (e.g., 'math-eval', 'swe-bench')."""

    model: str
    """Model identifier used for this run (e.g., 'claude-sonnet-4', 'gpt-4')."""

    trajectories: list[Trajectory]
    """List of trajectory samples produced during this evaluation run."""

id = None class-attribute instance-attribute

Optional server-assigned run ID. If None, server generates a UUID. If provided, appends to existing run.

model instance-attribute

Model identifier used for this run (e.g., 'claude-sonnet-4', 'gpt-4').

task instance-attribute

Task name for this run (e.g., 'math-eval', 'swe-bench').

trajectories instance-attribute

List of trajectory samples produced during this evaluation run.
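
A minimal sketch of assembling a Run for upload, assuming Run is importable from lunette.models.run (the source location shown above) and that trajectories is a list of Trajectory objects, e.g. produced by Trajectory.from_inspect; it is serialized here with pydantic's model_dump_json.

from lunette.models.run import Run  # assumed import path

run = Run(
    task="swe-bench",          # task name shared by every trajectory in the run
    model="claude-sonnet-4",   # model identifier shared by every trajectory
    trajectories=trajectories,
)

# `id` is left as None here, so the server would assign a UUID; passing an
# existing run ID instead would append these trajectories to that run.
payload = run.model_dump_json()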

Messages

SystemMessage

SystemMessage

Bases: BaseMessage

System message.

Source code in lunette/models/messages.py
class SystemMessage(BaseMessage):
    """System message."""

    role: Literal["system"] = "system"

    @classmethod
    def from_inspect(
        cls, position: int, message: InspectSystemMessage
    ) -> SystemMessage:
        """Convert an Inspect AI `ChatMessageSystem` to `SystemMessage`."""
        return cls(position=position, content=_content_from_inspect(message.content))

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageSystem to SystemMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls, position: int, message: InspectSystemMessage
) -> SystemMessage:
    """Convert an Inspect AI `ChatMessageSystem` to `SystemMessage`."""
    return cls(position=position, content=_content_from_inspect(message.content))

UserMessage

UserMessage

Bases: BaseMessage

User message.

Source code in lunette/models/messages.py
class UserMessage(BaseMessage):
    """User message."""

    role: Literal["user"] = "user"

    @classmethod
    def from_inspect(cls, position: int, message: InspectUserMessage) -> UserMessage:
        """Convert an Inspect AI `ChatMessageUser` to `UserMessage`."""
        return cls(position=position, content=_content_from_inspect(message.content))

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageUser to UserMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(cls, position: int, message: InspectUserMessage) -> UserMessage:
    """Convert an Inspect AI `ChatMessageUser` to `UserMessage`."""
    return cls(position=position, content=_content_from_inspect(message.content))

AssistantMessage

AssistantMessage

Bases: BaseMessage

Assistant message.

Source code in lunette/models/messages.py
class AssistantMessage(BaseMessage):
    """Assistant message."""

    role: Literal["assistant"] = "assistant"
    tool_calls: list[ToolCall] | None = None

    @classmethod
    def from_inspect(
        cls, position: int, message: InspectAssistantMessage
    ) -> AssistantMessage:
        """Convert an Inspect AI `ChatMessageAssistant` to `AssistantMessage`."""
        tool_calls = (
            [ToolCall.from_inspect(tool_call) for tool_call in message.tool_calls]
            if message.tool_calls
            else None
        )

        return cls(
            position=position,
            content=_content_from_inspect(message.content),
            tool_calls=tool_calls,
        )

from_inspect(position, message) classmethod

Convert an Inspect AI ChatMessageAssistant to AssistantMessage.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls, position: int, message: InspectAssistantMessage
) -> AssistantMessage:
    """Convert an Inspect AI `ChatMessageAssistant` to `AssistantMessage`."""
    tool_calls = (
        [ToolCall.from_inspect(tool_call) for tool_call in message.tool_calls]
        if message.tool_calls
        else None
    )

    return cls(
        position=position,
        content=_content_from_inspect(message.content),
        tool_calls=tool_calls,
    )

ToolMessage

ToolMessage

Bases: BaseMessage

Tool message.

The content field contains the result of the tool call.

Source code in lunette/models/messages.py
class ToolMessage(BaseMessage):
    """
    Tool message.

    The `content` field contains the result of the tool call.
    """

    role: Literal["tool"] = "tool"
    tool_call: ToolCall

    @classmethod
    def from_inspect(
        cls,
        position: int,
        message: InspectToolMessage,
        tool_call: ToolCall,
    ) -> ToolMessage:
        """
        Convert an Inspect AI `ChatMessageTool` to `ToolMessage`.

        Args:
            position: Position in the trajectory
            message: The Inspect ChatMessageTool
            tool_call: The matching ToolCall (found by the caller)

        Returns:
            ToolMessage with proper tool_call reference
        """
        return cls(
            position=position,
            content=message.text,
            tool_call=tool_call,
        )

    @property
    def function(self) -> str:
        """Get the function name of this tool call."""
        return self.tool_call.function

    @property
    def arguments(self) -> dict[str, Any]:
        """Get the arguments of this tool call."""
        return self.tool_call.arguments

    @property
    def result(self) -> str:
        """Get the result of this tool call."""
        return self.text

arguments property

Get the arguments of this tool call.

function property

Get the function name of this tool call.

result property

Get the result of this tool call.

from_inspect(position, message, tool_call) classmethod

Convert an Inspect AI ChatMessageTool to ToolMessage.

Parameters:

position (int): Position in the trajectory. Required.

message (ChatMessageTool): The Inspect ChatMessageTool. Required.

tool_call (ToolCall): The matching ToolCall (found by the caller). Required.

Returns:

ToolMessage: ToolMessage with proper tool_call reference.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(
    cls,
    position: int,
    message: InspectToolMessage,
    tool_call: ToolCall,
) -> ToolMessage:
    """
    Convert an Inspect AI `ChatMessageTool` to `ToolMessage`.

    Args:
        position: Position in the trajectory
        message: The Inspect ChatMessageTool
        tool_call: The matching ToolCall (found by the caller)

    Returns:
        ToolMessage with proper tool_call reference
    """
    return cls(
        position=position,
        content=message.text,
        tool_call=tool_call,
    )

ToolCall

ToolCall

Bases: BaseModel

A tool call.

Does not include the result of the tool call, as it is not available until a later ToolMessage is received.

Source code in lunette/models/messages.py
class ToolCall(BaseModel):
    """
    A tool call.

    Does not include the result of the tool call, as it is not available until a later `ToolMessage` is received.
    """

    id: str
    function: str
    arguments: dict[str, Any]

    @classmethod
    def from_inspect(cls, tool_call: InspectToolCall) -> ToolCall:
        """Convert an Inspect AI `ToolCall` to our `ToolCall` model."""
        return cls(
            id=tool_call.id,
            function=tool_call.function,
            arguments=tool_call.arguments,
        )

from_inspect(tool_call) classmethod

Convert an Inspect AI ToolCall to our ToolCall model.

Source code in lunette/models/messages.py
@classmethod
def from_inspect(cls, tool_call: InspectToolCall) -> ToolCall:
    """Convert an Inspect AI `ToolCall` to our `ToolCall` model."""
    return cls(
        id=tool_call.id,
        function=tool_call.function,
        arguments=tool_call.arguments,
    )
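
Taken together, a sketch of a tool-call exchange built directly from these message models, assuming they are importable from lunette.models.messages (the source location shown above); the positions, function name, and arguments are illustrative.

from lunette.models.messages import AssistantMessage, ToolCall, ToolMessage  # assumed import path

call = ToolCall(id="call_1", function="bash", arguments={"cmd": "ls /repo"})

assistant = AssistantMessage(
    position=0,
    content="Let me look at the repository contents.",
    tool_calls=[call],
)

tool = ToolMessage(
    position=1,
    content="README.md\nsetup.py",  # the tool's output goes in `content`
    tool_call=call,                 # the same ToolCall the assistant emitted
)

print(tool.function)   # "bash"
print(tool.arguments)  # {"cmd": "ls /repo"}
print(tool.result)     # the tool output text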

Scores

ScalarScore

Bases: BaseModel

A scalar score for a trajectory.

Source code in lunette/models/trajectory.py
class ScalarScore(BaseModel):
    """A scalar score for a trajectory."""

    value: float
    """The value of the score."""

    answer: str | None = None
    """Answer extracted from model output, if available."""

    explanation: str | None = None
    """Explanation of the score, if available."""

    metadata: dict[str, Any] | None = None
    """Additional metadata about the score."""

answer = None class-attribute instance-attribute

Answer extracted from model output, if available.

explanation = None class-attribute instance-attribute

Explanation of the score, if available.

metadata = None class-attribute instance-attribute

Additional metadata about the score.

value instance-attribute

The value of the score.
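
A small sketch of a fully populated ScalarScore, assuming it is importable from lunette.models.trajectory (the source location shown above); the field values are illustrative.

from lunette.models.trajectory import ScalarScore  # assumed import path

score = ScalarScore(
    value=1.0,
    answer="42",
    explanation="Matched the reference answer exactly.",
    metadata={"scorer": "exact_match"},  # illustrative, not a required schema
)

These are the objects stored in the Trajectory.scores mapping, keyed by metric name (or by composite names such as name/subkey when an Inspect score holds a dict of values).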