Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions packages/optimization/src/ldai_optimization/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
OptimizationResultPayload,
)
from ldai_optimization.prompts import (
_acceptance_criteria_implies_duration_optimization,
build_message_history_text,
build_new_variation_prompt,
build_reasoning_history,
Expand Down Expand Up @@ -80,6 +81,12 @@ def _compute_validation_count(pool_size: int) -> int:
# the variation step is treated as a failure.
_MAX_VARIATION_RETRIES = 3

# Duration gate: a candidate must be at least this much faster than the baseline
# (history[0].duration_ms) to pass the duration check when acceptance criteria
# imply a latency optimization goal. 0.80 means the candidate must clock in at
# under 80% of the baseline — i.e. at least 20% improvement.
_DURATION_TOLERANCE = 0.80

# Maps SDK status strings to the API status/activity values expected by
# agent_optimization_result records. Defined at module level to avoid
# allocating the dict on every on_status_update invocation.
Expand Down Expand Up @@ -328,6 +335,7 @@ async def _call_judges(
variables: Optional[Dict[str, Any]] = None,
agent_tools: Optional[List[ToolDefinition]] = None,
expected_response: Optional[str] = None,
agent_duration_ms: Optional[float] = None,
) -> Dict[str, JudgeResult]:
"""
Call all judges in parallel (auto-path).
Expand All @@ -344,6 +352,9 @@ async def _call_judges(
:param agent_tools: Normalised list of tool dicts that were available to the agent
:param expected_response: Optional ground truth expected response. When provided,
judges are instructed to factor it into their scoring alongside acceptance criteria.
:param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
Forwarded to acceptance judges whose statement implies a latency goal so they
can mention the duration change in their rationale.
:return: Dictionary of judge results (score and rationale)
"""
if not self._options.judges:
Expand Down Expand Up @@ -396,6 +407,7 @@ async def _call_judges(
variables=resolved_variables,
agent_tools=resolved_agent_tools,
expected_response=expected_response,
agent_duration_ms=agent_duration_ms,
)
judge_results[judge_key] = result

Expand Down Expand Up @@ -613,6 +625,7 @@ async def _evaluate_acceptance_judge(
variables: Optional[Dict[str, Any]] = None,
agent_tools: Optional[List[ToolDefinition]] = None,
expected_response: Optional[str] = None,
agent_duration_ms: Optional[float] = None,
) -> JudgeResult:
"""
Evaluate using an acceptance statement judge.
Expand All @@ -627,6 +640,9 @@ async def _evaluate_acceptance_judge(
:param agent_tools: Normalised list of tool dicts that were available to the agent
:param expected_response: Optional ground truth expected response. When provided,
injected into instructions and judge message so the judge can score actual vs. expected.
:param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
When the acceptance statement implies a latency goal, the judge is instructed
to mention the duration change in its rationale.
:return: The judge result with score and rationale
"""
if not optimization_judge.acceptance_statement:
Expand Down Expand Up @@ -662,6 +678,32 @@ async def _evaluate_acceptance_judge(
'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}'
)

if (
agent_duration_ms is not None
and _acceptance_criteria_implies_duration_optimization(
{judge_key: optimization_judge}
)
):
baseline_ms = (
self._history[0].duration_ms
if self._history and self._history[0].duration_ms is not None
else None
)
instructions += (
f"\n\nThe acceptance criteria for this judge includes a latency/duration goal. "
f"The agent's response took {agent_duration_ms:.0f}ms to generate. "
)
if baseline_ms is not None:
delta_ms = agent_duration_ms - baseline_ms
direction = "faster" if delta_ms < 0 else "slower"
instructions += (
f"The baseline duration (first iteration) was {baseline_ms:.0f}ms. "
f"This response was {abs(delta_ms):.0f}ms {direction} than the baseline. "
)
instructions += (
"Please mention the duration and any change from baseline in your rationale."
)

if resolved_variables:
instructions += f"\n\nThe following variables were available to the agent: {json.dumps(resolved_variables)}"

Expand Down Expand Up @@ -911,6 +953,11 @@ async def _run_ground_truth_optimization(
else:
sample_passed = self._evaluate_response(optimize_context)

if sample_passed and _acceptance_criteria_implies_duration_optimization(
self._options.judges
):
sample_passed = self._evaluate_duration(optimize_context)

if not sample_passed:
logger.info(
"[GT Attempt %d] -> Sample %d/%d FAILED",
Expand Down Expand Up @@ -1147,6 +1194,9 @@ async def _generate_new_variation(
)
self._safe_status_update("generating variation", status_ctx, iteration)

optimize_for_duration = _acceptance_criteria_implies_duration_optimization(
self._options.judges
)
instructions = build_new_variation_prompt(
self._history,
self._options.judges,
Expand All @@ -1156,6 +1206,7 @@ async def _generate_new_variation(
self._options.model_choices,
self._options.variable_choices,
self._initial_instructions,
optimize_for_duration=optimize_for_duration,
)

# Create a flat history list (without nested history) to avoid exponential growth
Expand Down Expand Up @@ -1486,6 +1537,7 @@ async def _execute_agent_turn(
variables=optimize_context.current_variables,
agent_tools=agent_tools,
expected_response=expected_response,
agent_duration_ms=agent_duration_ms,
)

return dataclasses.replace(
Expand Down Expand Up @@ -1523,6 +1575,38 @@ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:

return True

def _evaluate_duration(self, optimize_context: OptimizationContext) -> bool:
    """
    Gate a candidate on latency improvement relative to the baseline iteration.

    The baseline is ``history[0].duration_ms`` — the first completed iteration,
    i.e. the original unoptimized configuration. A candidate passes only when
    its duration is strictly below ``baseline * _DURATION_TOLERANCE`` (with the
    default of 0.80, that means at least a 20% speedup).

    Missing timing data is never penalised: when there is no history, the
    baseline duration was not recorded, or the candidate's own duration was
    not captured, the check passes vacuously.

    :param optimize_context: The completed turn context containing duration_ms
    :return: True if the duration requirement is met or cannot be checked
    """
    baseline = self._history[0].duration_ms if self._history else None
    candidate = optimize_context.duration_ms
    if baseline is None or candidate is None:
        # Timing data is absent on one side — do not block the candidate.
        return True
    target = baseline * _DURATION_TOLERANCE
    if candidate < target:
        return True
    logger.warning(
        "[Iteration %d] -> Duration check failed: %.0fms >= baseline %.0fms * %.0f%% (%.0fms)",
        optimize_context.iteration,
        candidate,
        baseline,
        _DURATION_TOLERANCE * 100,
        target,
    )
    return False

def _handle_success(
self, optimize_context: OptimizationContext, iteration: int
) -> Any:
Expand Down Expand Up @@ -1691,6 +1775,11 @@ async def _run_validation_phase(
else:
sample_passed = self._evaluate_response(val_ctx)

if sample_passed and _acceptance_criteria_implies_duration_optimization(
self._options.judges
):
sample_passed = self._evaluate_duration(val_ctx)

last_ctx = val_ctx

if not sample_passed:
Expand Down Expand Up @@ -1798,6 +1887,11 @@ async def _run_optimization(
iteration,
)

if initial_passed and _acceptance_criteria_implies_duration_optimization(
self._options.judges
):
initial_passed = self._evaluate_duration(optimize_context)

if initial_passed:
all_valid, last_ctx = await self._run_validation_phase(
optimize_context, iteration
Expand Down
68 changes: 68 additions & 0 deletions packages/optimization/src/ldai_optimization/prompts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
"""Prompt-building functions for LaunchDarkly AI optimization."""

import re
from typing import Any, Dict, List, Optional

from ldai_optimization.dataclasses import (
OptimizationContext,
OptimizationJudge,
)

# Latency-related vocabulary. A match in any judge's acceptance statement is
# taken to mean the optimization should also target response duration. The
# bare-"ms" alternative uses a lookbehind so tokens like "forms" don't match.
_DURATION_KEYWORDS = re.compile(
    r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|"
    r"time\s+to\s+respond|milliseconds|performant|snappy|efficient|seconds)\b|"
    r"(?<![a-zA-Z])ms\b",
    re.IGNORECASE,
)


def _acceptance_criteria_implies_duration_optimization(
    judges: Optional[Dict[str, OptimizationJudge]],
) -> bool:
    """Return True if any judge acceptance statement implies a latency optimization goal.

    Performs a case-insensitive keyword scan over every judge's
    ``acceptance_statement``. A ``None`` or empty judge mapping, or judges
    without acceptance statements, yield False.

    :param judges: Judge configuration dict from OptimizationOptions, or None.
    :return: True if duration optimization should be applied.
    """
    if not judges:
        return False
    statements = (judge.acceptance_statement for judge in judges.values())
    return any(s and _DURATION_KEYWORDS.search(s) for s in statements)


def build_message_history_text(
history: List[OptimizationContext],
Expand Down Expand Up @@ -82,6 +112,7 @@ def build_new_variation_prompt(
model_choices: List[str],
variable_choices: List[Dict[str, Any]],
initial_instructions: str,
optimize_for_duration: bool = False,
) -> str:
"""
Build the LLM prompt for generating an improved agent configuration.
Expand All @@ -99,6 +130,8 @@ def build_new_variation_prompt(
:param model_choices: List of model IDs the LLM may select from
:param variable_choices: List of variable dicts (used to derive placeholder names)
:param initial_instructions: The original unmodified instructions template
:param optimize_for_duration: When True, appends a duration optimization section
instructing the LLM to prefer faster models and simpler instructions.
:return: The assembled prompt string
"""
sections = [
Expand All @@ -112,6 +145,7 @@ def build_new_variation_prompt(
variation_prompt_improvement_instructions(
history, model_choices, variable_choices, initial_instructions
),
variation_prompt_duration_optimization(model_choices) if optimize_for_duration else "",
]

return "\n\n".join(s for s in sections if s)
Expand Down Expand Up @@ -211,6 +245,8 @@ def variation_prompt_configuration(
if previous_ctx.user_input:
lines.append(f"User question: {previous_ctx.user_input}")
lines.append(f"Agent response: {previous_ctx.completion_response}")
if previous_ctx.duration_ms is not None:
lines.append(f"Agent duration: {previous_ctx.duration_ms:.0f}ms")
return "\n".join(lines)
else:
return "\n".join(
Expand Down Expand Up @@ -262,6 +298,8 @@ def variation_prompt_feedback(
if result.rationale:
feedback_line += f"\n Reasoning: {result.rationale}"
lines.append(feedback_line)
if ctx.duration_ms is not None:
lines.append(f"Agent duration: {ctx.duration_ms:.0f}ms")
return "\n".join(lines)


Expand Down Expand Up @@ -487,3 +525,33 @@ def variation_prompt_improvement_instructions(
parameters_instructions,
]
)


def variation_prompt_duration_optimization(model_choices: List[str]) -> str:
    """
    Build the duration-optimization section of the variation prompt.

    Emitted only when the acceptance criteria imply a latency-reduction goal.
    The section tells the LLM that speed is a secondary objective — quality
    criteria still come first — and names concrete levers: choose a faster
    model, trim the instructions, and avoid generation-lengthening parameters.

    :param model_choices: List of model IDs the LLM may select from, so it can
        apply its own knowledge of which models tend to be faster.
    :return: The duration optimization prompt block.
    """
    return (
        "## Duration Optimization:\n"
        "The acceptance criteria for this optimization implies that response latency should be reduced.\n"
        "In addition to improving quality, generate a variation that aims to reduce the agent's response time.\n"
        "You may:\n"
        "- Select a faster model from the available choices if quality requirements can still be met.\n"
        f" Available models: {model_choices}\n"
        " Use your knowledge of these models to prefer those that are known to respond more quickly.\n"
        "- Simplify or shorten the instructions where this does not compromise the acceptance criteria.\n"
        " Shorter prompts reduce input token counts and typically yield faster responses.\n"
        "- Avoid increasing max_tokens or other parameters that extend generation time.\n"
        "Quality criteria remain the primary objective — do not sacrifice passing scores to achieve lower latency."
    )
Loading
Loading