Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ dependencies = [
# Used by authentication/k8s integration
"kubernetes>=30.1.0",
# Used to call Llama Stack APIs
"llama-stack==0.5.2",
"llama-stack-client==0.5.2",
"llama-stack-api==0.5.2",
"llama-stack==0.6.0",
"llama-stack-client==0.6.0",
"llama-stack-api==0.6.0",
# Used by Logger
"rich>=14.0.0",
# Used by JWK token auth handler
Expand Down Expand Up @@ -160,6 +160,7 @@ llslibdev = [
"faiss-cpu>=1.11.0",
"chardet>=5.2.0",
"psycopg2-binary>=2.9.10",
"pypdf>=6.9.2",
# API scoring: inline::basic
"requests>=2.33.0",
# API datasetio: inline::localfs
Expand Down
3 changes: 2 additions & 1 deletion requirements-build.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ flit-core==3.12.0
# jinja2
# packaging
# pathspec
# pypdf
# wheel
hatch-fancy-pypi-readme==25.1.0
# via
Expand Down Expand Up @@ -63,7 +64,7 @@ jinja2==3.1.6
# via uv-dynamic-versioning
markupsafe==3.0.3
# via jinja2
maturin==1.10.2
maturin==1.12.6
# via fastuuid
packaging==26.0
# via
Expand Down
27 changes: 15 additions & 12 deletions requirements.hashes.source.txt
Original file line number Diff line number Diff line change
Expand Up @@ -509,15 +509,15 @@ langdetect==1.0.9 \
litellm==1.82.6 \
--hash=sha256:164a3ef3e19f309e3cabc199bef3d2045212712fefdfa25fc7f75884a5b5b205 \
--hash=sha256:2aa1c2da21fe940c33613aa447119674a3ad4d2ad5eb064e4d5ce5ee42420136
llama-stack==0.5.2 \
--hash=sha256:581fda638088ee029aab20afe3c42ba8f7f6ef21c80bd9ebcae20bb13c3409d3 \
--hash=sha256:9334c781e4ded6520aa60c3301a9087e9fb8fdaea8e5f30f8e21d85b17231d8d
llama-stack-api==0.5.2 \
--hash=sha256:6531556dd8bb6555d778360ecfcd850aad7a49a8172b68146995d538e71641f0 \
--hash=sha256:a272e4b803fe24a8ba7d22e6d904bf88abd118ba0b6610a20ff5dedb09f38ad7
llama-stack-client==0.5.2 \
--hash=sha256:17c1bbad90f7699da4eb3cae256e8823caa4d2be945512a45c8c6f89ab899f28 \
--hash=sha256:473f4d67ac0b243b0fc29555a0203a742615d31bea606b4332d9e2f193f73d6a
llama-stack==0.6.0 \
--hash=sha256:b804830664dc91e54c7225a7a081cb1874c48fc18573569c19fac4a9397e8076 \
--hash=sha256:d92711791633f5505a4473ffba3f3e26acb700716fddab5aec419d99e614c802
llama-stack-api==0.6.0 \
--hash=sha256:b99a03aba3659736b6b540c9e5e674b1daac2bf5eeb2a68795113d62b8250672 \
--hash=sha256:f0f3a1a6239a5d3b8c7ef02cefdf817c96c6461dcd8a82c1689ac67ec3107270
llama-stack-client==0.6.0 \
--hash=sha256:3290aac36dcafbd1bc0baaf995522e2037f57056672b5a1516af112a4210f3ea \
--hash=sha256:7e514a6ffd92f237aceb062dadc4db44e24a3cd9c4ea35e25173d1e0739beb8e
markupsafe==3.0.3 \
--hash=sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f \
--hash=sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a \
Expand Down Expand Up @@ -637,9 +637,9 @@ multiprocess==0.70.19 \
nltk==3.9.4 \
--hash=sha256:ed03bc098a40481310320808b2db712d95d13ca65b27372f8a403949c8b523d0 \
--hash=sha256:f2fa301c3a12718ce4a0e9305c5675299da5ad9e26068218b69d692fda84828f
oci==2.169.0 \
--hash=sha256:c71bb5143f307791082b3e33cc1545c2490a518cfed85ab1948ef5107c36d30b \
--hash=sha256:f3c5fff00b01783b5325ea7b13bf140053ec1e9f41da20bfb9c8a349ee7662fa
oci==2.170.0 \
--hash=sha256:1e205a395e856b6514501d34595dd1e8e261415aaaab2c97f77fd2fc9618dcc3 \
--hash=sha256:49adf0ffaf754c07c43194d19d2e11f5e1b602a95bb365bb384d939ff365b1b4
openai==2.30.0 \
--hash=sha256:92f7661c990bda4b22a941806c83eabe4896c3094465030dd882a71abe80c885 \
--hash=sha256:9a5ae616888eb2748ec5e0c5b955a51592e0b201a11f4262db920f2a78c5231d
Expand Down Expand Up @@ -874,6 +874,9 @@ pyjwt==2.12.1 \
pyopenssl==26.0.0 \
--hash=sha256:df94d28498848b98cc1c0ffb8ef1e71e40210d3b0a8064c9d29571ed2904bf81 \
--hash=sha256:f293934e52936f2e3413b89c6ce36df66a0b34ae1ea3a053b8c5020ff2f513fc
pypdf==6.9.2 \
--hash=sha256:662cf29bcb419a36a1365232449624ab40b7c2d0cfc28e54f42eeecd1fd7e844 \
--hash=sha256:7f850faf2b0d4ab936582c05da32c52214c2b089d61a316627b5bfb5b0dab46c
pythainlp==5.3.4 \
--hash=sha256:76744e51e27c895630bafd74f53a1f0aa8782cef2f7f02eebd6427fe8ce8d84d \
--hash=sha256:e66fd76fb5931834fd4e32ed54337ec62350d7654f187850e4dd4f915e9f624f
Expand Down
5 changes: 0 additions & 5 deletions requirements.hashes.wheel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,6 @@ pandas==2.3.3 \
--hash=sha256:e3a18fae723b808514670a4a0172f9939cdbb095abd5eef1f34cf5ae1b99f424
peft==0.18.1 \
--hash=sha256:026817e68c93fcc0569360afa0ee4fb74b06b0a4268240f922bc2bc0a691bcc1
pillow==12.1.1 \
--hash=sha256:58c0a6787ac12753fba61035713f939f33946c970fe48a5756ed1a36c22d2e79 \
--hash=sha256:98ab177b9de8751ec5b1dbb7597b45c7edc358f7e16e5764ae93c976c6433f9f \
--hash=sha256:dd45829dc58e931ebef6a4b7eb122efef838a8d37848d5ec857a79d4c7e8f543 \
--hash=sha256:fcf5fa3497ec5c32843367d1133bc6b6d273e82d2fc86cd6d309cc09e7e457cf
prometheus-client==0.24.1 \
--hash=sha256:fe601e041eac55bad8f46da2f3c54f2ab6cd8a8272d9595742c83980e95ed5e4
prompt-toolkit==3.0.52 \
Expand Down
10 changes: 7 additions & 3 deletions src/app/endpoints/conversations_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any

from fastapi import APIRouter, Depends, HTTPException, Request
from llama_stack_api import ConversationNotFoundError
from llama_stack_client import (
APIConnectionError,
APIStatusError,
Expand Down Expand Up @@ -275,7 +276,8 @@ async def get_conversation_endpoint_handler( # pylint: disable=too-many-locals,
).model_dump()
raise HTTPException(**response) from e

except APIStatusError as e:
except (APIStatusError, ConversationNotFoundError) as e:
# In library mode, ConversationNotFoundError is raised instead of APIStatusError
logger.error("Conversation not found: %s", e)
response = NotFoundResponse(
resource="conversation", resource_id=normalized_conv_id
Expand Down Expand Up @@ -382,7 +384,8 @@ async def delete_conversation_endpoint_handler(
response = ServiceUnavailableResponse(backend_name="Llama Stack", cause=str(e))
raise HTTPException(**response.model_dump()) from e

except APIStatusError:
except (APIStatusError, ConversationNotFoundError):
# In library mode, ConversationNotFoundError is raised instead of APIStatusError
logger.warning(
"Conversation %s in LlamaStack not found. Treating as already deleted.",
normalized_conv_id,
Expand Down Expand Up @@ -519,7 +522,8 @@ async def update_conversation_endpoint_handler(
).model_dump()
raise HTTPException(**response) from e

except APIStatusError as e:
except (APIStatusError, ConversationNotFoundError) as e:
# In library mode, ConversationNotFoundError is raised instead of APIStatusError
logger.error("Conversation not found: %s", e)
response = NotFoundResponse(
resource="conversation", resource_id=normalized_conv_id
Expand Down
3 changes: 2 additions & 1 deletion src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from utils.query import (
consume_query_tokens,
handle_known_apistatus_errors,
is_context_length_error,
prepare_input,
store_query_results,
update_azure_token,
Expand Down Expand Up @@ -303,7 +304,7 @@ async def retrieve_response(
response = cast(OpenAIResponseObject, response)

except RuntimeError as e: # library mode wraps 413 into runtime error
if "context_length" in str(e).lower():
if is_context_length_error(str(e)):
error_response = PromptTooLongResponse(model=responses_params.model)
raise HTTPException(**error_response.model_dump()) from e
raise e
Expand Down
5 changes: 3 additions & 2 deletions src/app/endpoints/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
consume_query_tokens,
extract_provider_and_model_from_model_id,
handle_known_apistatus_errors,
is_context_length_error,
store_query_results,
update_azure_token,
validate_model_provider_override,
Expand Down Expand Up @@ -335,7 +336,7 @@ async def handle_streaming_response(
inline_rag_context=inline_rag_context,
)
except RuntimeError as e: # library mode wraps 413 into runtime error
if "context_length" in str(e).lower():
if is_context_length_error(str(e)):
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
raise e
Expand Down Expand Up @@ -696,7 +697,7 @@ async def handle_non_streaming_response(
)

except RuntimeError as e:
if "context_length" in str(e).lower():
if is_context_length_error(str(e)):
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
raise e
Expand Down
4 changes: 2 additions & 2 deletions src/app/endpoints/rlsapi_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from utils.query import (
extract_provider_and_model_from_model_id,
handle_known_apistatus_errors,
is_context_length_error,
)
from utils.responses import (
build_turn_summary,
Expand Down Expand Up @@ -531,8 +532,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
return HTTPException(**error_response.model_dump())

if isinstance(error, RuntimeError):
error_message = str(error).lower()
if "context_length" in error_message or "context length" in error_message:
if is_context_length_error(str(error)):
logger.error("Prompt too long for request %s: %s", request_id, error)
error_response = PromptTooLongResponse(model=model_id)
return HTTPException(**error_response.model_dump())
Expand Down
7 changes: 4 additions & 3 deletions src/app/endpoints/streaming_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
consume_query_tokens,
extract_provider_and_model_from_model_id,
handle_known_apistatus_errors,
is_context_length_error,
prepare_input,
store_query_results,
update_azure_token,
Expand Down Expand Up @@ -354,7 +355,7 @@ async def retrieve_response_generator(
)
# Handle known LLS client errors only at stream creation time and shield execution
except RuntimeError as e: # library mode wraps 413 into runtime error
if "context_length" in str(e).lower():
if is_context_length_error(str(e)):
error_response = PromptTooLongResponse(model=responses_params.model)
raise HTTPException(**error_response.model_dump()) from e
raise e
Expand Down Expand Up @@ -590,7 +591,7 @@ async def generate_response(
except RuntimeError as e: # library mode wraps 413 into runtime error
error_response = (
PromptTooLongResponse(model=responses_params.model)
if "context_length" in str(e).lower()
if is_context_length_error(str(e))
else InternalServerErrorResponse.generic()
)
yield stream_http_error_event(error_response, context.query_request.media_type)
Expand Down Expand Up @@ -835,7 +836,7 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat
)
error_response = (
PromptTooLongResponse(model=context.model_id)
if "context_length" in error_message.lower()
if is_context_length_error(error_message)
else InternalServerErrorResponse.query_failed(error_message)
)
yield stream_http_error_event(error_response, media_type)
Expand Down
2 changes: 1 addition & 1 deletion src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Minimal and maximal supported Llama Stack version
MINIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.2.17"
MAXIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.5.2"
MAXIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.6.0"

UNABLE_TO_PROCESS_RESPONSE = "Unable to process this request"

Expand Down
18 changes: 14 additions & 4 deletions src/utils/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,19 @@
logger = get_logger(__name__)


def is_context_length_error(error_message: str) -> bool:
    """Check if an error message indicates a context length exceeded error.

    Both the snake_case form ("context_length") and the spaced form
    ("context length") are recognized, case-insensitively, since different
    backends phrase the error differently.

    Args:
        error_message: The error message to check.

    Returns:
        True if the error indicates context length was exceeded.
    """
    normalized = error_message.lower()
    return any(marker in normalized for marker in ("context_length", "context length"))


def store_conversation_into_cache(
user_id: str,
conversation_id: str,
Expand Down Expand Up @@ -578,10 +591,7 @@ def handle_known_apistatus_errors(
"""
if error.status_code == 400:
error_message = getattr(error, "message", str(error))
if (
"context_length" in error_message.lower()
or "context length" in error_message.lower()
):
if is_context_length_error(error_message):
return PromptTooLongResponse(model=model_id)
elif error.status_code == 429:
return QuotaExceededResponse.model(model_id)
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/features/info.feature
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Feature: Info tests
When I access REST API endpoint "info" using HTTP GET method
Then The status code of the response is 200
And The body of the response has proper name Lightspeed Core Service (LCS) and version 0.5.0
And The body of the response has llama-stack version 0.5.2
And The body of the response has llama-stack version 0.6.0

@skip-in-library-mode
Scenario: Check if info endpoint reports error when llama-stack connection is not working
Expand Down
Loading
Loading