diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 573365b..72f7fe9 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -3,7 +3,14 @@ from importlib import metadata from typing import TYPE_CHECKING -from langchain_apify._actor_tools import ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify._actor_tools import ( + ApifyEcommerceScraperTool, + ApifyGoogleMapsTool, + ApifyGoogleSearchTool, + ApifyRAGWebBrowserTool, + ApifyWebCrawlerTool, + ApifyYouTubeScraperTool, +) from langchain_apify.document_loaders import ApifyCrawlLoader, ApifyDatasetLoader from langchain_apify.retrievers import ApifySearchRetriever from langchain_apify.tools import ( @@ -42,12 +49,18 @@ APIFY_SEARCH_TOOLS: list[type[BaseTool]] = [ ApifyGoogleSearchTool, ApifyWebCrawlerTool, + ApifyRAGWebBrowserTool, + ApifyGoogleMapsTool, + ApifyYouTubeScraperTool, + ApifyEcommerceScraperTool, ] __all__ = [ # Existing components (backward-compatible) 'ApifyActorsTool', + 'ApifyCrawlLoader', 'ApifyDatasetLoader', + 'ApifySearchRetriever', 'ApifyWrapper', # Core generic tools 'ApifyGetDatasetItemsTool', @@ -56,16 +69,16 @@ 'ApifyRunTaskAndGetDatasetTool', 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', - # Actor-specific tools + # Search & crawling tools 'ApifyGoogleSearchTool', 'ApifyWebCrawlerTool', - # Retriever - 'ApifySearchRetriever', - # Loaders - 'ApifyCrawlLoader', + 'ApifyRAGWebBrowserTool', + 'ApifyGoogleMapsTool', + 'ApifyYouTubeScraperTool', + 'ApifyEcommerceScraperTool', # Tool group lists - 'APIFY_SEARCH_TOOLS', 'APIFY_CORE_TOOLS', + 'APIFY_SEARCH_TOOLS', # Meta '__version__', ] diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 998c0fe..7473d9c 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -8,29 +8,37 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from langchain_core.tools import ToolException -from pydantic import BaseModel # noqa: TCH002 +from pydantic import BaseModel, Field from langchain_apify._client import ( _DEFAULT_CRAWLER_TYPE, _DEFAULT_GOOGLE_MAX_RESULTS, _DEFAULT_MAX_CRAWL_DEPTH, _DEFAULT_MAX_CRAWL_PAGES, + _DEFAULT_RAG_MAX_RESULTS, _DEFAULT_RUN_TIMEOUT_SECS, ) from langchain_apify._types import CrawlerType # noqa: TCH001 # runtime-needed: shared Literal alias -from langchain_apify._utils import _extract_content, _safe_title +from langchain_apify._utils import _extract_content, _extract_source, _safe_title from langchain_apify.tools import ( ApifyGoogleSearchInput, ApifyWebCrawlerInput, _ApifyGenericTool, + _run_meta, ) if TYPE_CHECKING: from langchain_core.callbacks import CallbackManagerForToolRun +# Per-tool result limits not shared via ``_client`` (Maps/YouTube/Ecommerce). +_DEFAULT_GOOGLE_MAPS_MAX_RESULTS = 10 +_DEFAULT_YOUTUBE_MAX_RESULTS = 10 +_DEFAULT_ECOMMERCE_MAX_RESULTS = 20 + + # --------------------------------------------------------------------------- # Search & Crawling tools # --------------------------------------------------------------------------- @@ -173,3 +181,298 @@ def _run( if isinstance(item, dict) ] return json.dumps({'run': None, 'items': pages}, default=str) + + +# --------------------------------------------------------------------------- +# Input schemas (US-4 Search & Crawling Actor tools) +# --------------------------------------------------------------------------- + + +class ApifyRAGWebBrowserInput(BaseModel): + """Input schema for :class:`ApifyRAGWebBrowserTool`.""" + + query: str = Field(description='Search query string.') + max_results: int = Field(default=_DEFAULT_RAG_MAX_RESULTS, description='Maximum number of results to return.') + + +class ApifyGoogleMapsInput(BaseModel): + """Input schema for :class:`ApifyGoogleMapsTool`.""" + + query: str = Field(description='Search query (e.g. "coffee shops in Berlin").') + max_results: int = Field( + default=_DEFAULT_GOOGLE_MAPS_MAX_RESULTS, description='Maximum number of places to return.' + ) + language: str | None = Field( + default=None, + description='Optional ISO language code for results (e.g. "en", "de").', + ) + + +class ApifyYouTubeScraperInput(BaseModel): + """Input schema for :class:`ApifyYouTubeScraperTool`.""" + + search_query: str = Field( + description=('Keyword for "search" mode, or a video/channel URL for "video"/"channel" modes.'), + ) + search_type: Literal['search', 'video', 'channel'] = Field( + default='search', + description='Scrape mode: search keyword, single video URL, or channel URL.', + ) + max_results: int = Field(default=_DEFAULT_YOUTUBE_MAX_RESULTS, description='Maximum number of items to return.') + + +class ApifyEcommerceScraperInput(BaseModel): + """Input schema for :class:`ApifyEcommerceScraperTool`.""" + + url: str = Field(description='Product-detail URL or category / listing page URL to scrape.') + url_type: Literal['product', 'category'] = Field( + default='product', + description=( + 'Type of page the URL points to: "product" for a product-detail page, ' + '"category" for a category / listing page.' + ), + ) + max_results: int = Field( + default=_DEFAULT_ECOMMERCE_MAX_RESULTS, description='Maximum number of products to return.' + ) + + +# --------------------------------------------------------------------------- +# Tools (US-4 Search & Crawling Actor tools) +# --------------------------------------------------------------------------- + + +class ApifyRAGWebBrowserTool(_ApifyGenericTool): # type: ignore[override] + """Search the web and return content from top results. + + Wraps the ``apify/rag-web-browser`` Actor. Unlike + :class:`ApifySearchRetriever` (which returns LangChain ``Document`` + objects for RAG pipelines), this tool returns a JSON envelope + suitable for agent tool-calling. + + Args: + apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON object ``{"run": {...}, "items": [{"url", "title", "content"}]}``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRAGWebBrowserTool + + tool = ApifyRAGWebBrowserTool() + result = tool.invoke({"query": "what is LangChain?", "max_results": 3}) + """ + + name: str = 'apify_rag_web_browser' + description: str = ( + 'Search the web and return a JSON envelope with crawled results.' + ' Each item has keys: url, title, content.' + ' Required: query (str) - the search query.' + f' Optional: max_results (int, default {_DEFAULT_RAG_MAX_RESULTS}).' + ' Returns keys: run, items.' + ) + args_schema: type[BaseModel] = ApifyRAGWebBrowserInput + + def _run( + self, + query: str, + max_results: int = _DEFAULT_RAG_MAX_RESULTS, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.rag_web_search( + query, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + results = [ + { + 'url': _extract_source(item), + 'title': _safe_title(item), + 'content': _extract_content(item), + } + for item in items + if isinstance(item, dict) + ] + return json.dumps({'run': _run_meta(run), 'items': results}, default=str) + + +class ApifyGoogleMapsTool(_ApifyGenericTool): # type: ignore[override] + """Search Google Maps for places, reviews, and business details. + + Wraps the ``compass/crawler-google-places`` Actor. + + Args: + apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON object ``{"run": {...}, "items": [...]}`` where ``run`` holds + ``run_id``, ``status``, ``dataset_id``, ``started_at``, ``finished_at`` + and ``items`` are place dicts. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyGoogleMapsTool + + tool = ApifyGoogleMapsTool() + result = tool.invoke({"query": "coffee shops in Berlin", "max_results": 5}) + """ + + name: str = 'apify_google_maps' + description: str = ( + 'Search Google Maps places, reviews, and business details and return a JSON envelope.' + ' Required: query (str) - the search query.' + f' Optional: max_results (int, default {_DEFAULT_GOOGLE_MAPS_MAX_RESULTS}),' + ' language (str|null - ISO code, e.g. "en").' + ' Returns keys: run, items.' + ) + args_schema: type[BaseModel] = ApifyGoogleMapsInput + + def _run( + self, + query: str, + max_results: int = _DEFAULT_GOOGLE_MAPS_MAX_RESULTS, + language: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.google_maps_search( + query, + max_results=self._clamp_items(max_results), + language=language, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}, default=str) + + +class ApifyYouTubeScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape YouTube videos, channels, or search results. + + Wraps the ``streamers/youtube-scraper`` Actor. + + Args: + apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON object ``{"run": {...}, "items": [...]}`` where ``run`` holds + ``run_id``, ``status``, ``dataset_id``, ``started_at``, ``finished_at`` + and ``items`` are video / channel dicts. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyYouTubeScraperTool + + tool = ApifyYouTubeScraperTool() + result = tool.invoke({ + "search_query": "langchain tutorial", + "search_type": "search", + "max_results": 5, + }) + """ + + name: str = 'apify_youtube_scraper' + description: str = ( + 'Scrape YouTube by keyword, video URL, or channel URL and return a JSON envelope.' + ' Required: search_query (str - keyword for "search" mode, or a video/channel URL).' + ' Optional: search_type (one of "search", "video", "channel"; default "search"),' + f' max_results (int, default {_DEFAULT_YOUTUBE_MAX_RESULTS}).' + ' Returns keys: run, items.' + ) + args_schema: type[BaseModel] = ApifyYouTubeScraperInput + + def _run( + self, + search_query: str, + search_type: Literal['search', 'video', 'channel'] = 'search', + max_results: int = _DEFAULT_YOUTUBE_MAX_RESULTS, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.youtube_scrape( + search_query=search_query, + search_type=search_type, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}, default=str) + + +class ApifyEcommerceScraperTool(_ApifyGenericTool): # type: ignore[override] + """Extract product or listing data from an e-commerce URL. + + Wraps the ``apify/e-commerce-scraping-tool`` Actor. + + Args: + apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON object ``{"run": {...}, "items": [...]}`` where ``run`` holds + ``run_id``, ``status``, ``dataset_id``, ``started_at``, ``finished_at`` + and ``items`` are product / listing dicts. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyEcommerceScraperTool + + tool = ApifyEcommerceScraperTool() + result = tool.invoke({ + "url": "https://shop.example.com/category/123", + "url_type": "category", + "max_results": 20, + }) + """ + + name: str = 'apify_ecommerce_scraper' + description: str = ( + 'Extract product data from an e-commerce URL and return a JSON envelope.' + ' Required: url (str) - product-detail or category / listing URL.' + ' Optional: url_type (one of "product", "category"; default "product"),' + f' max_results (int, default {_DEFAULT_ECOMMERCE_MAX_RESULTS}).' + ' Returns keys: run, items.' + ) + args_schema: type[BaseModel] = ApifyEcommerceScraperInput + + def _run( + self, + url: str, + url_type: Literal['product', 'category'] = 'product', + max_results: int = _DEFAULT_ECOMMERCE_MAX_RESULTS, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.ecommerce_scrape( + url, + url_type=url_type, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}, default=str) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index a151312..98855da 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -24,6 +24,12 @@ _WEBSITE_CONTENT_CRAWLER_ACTOR_ID = 'apify/website-content-crawler' _GOOGLE_SEARCH_ACTOR_ID = 'apify/google-search-scraper' _RAG_WEB_BROWSER_ACTOR_ID = 'apify/rag-web-browser' +_GOOGLE_MAPS_ACTOR_ID = 'compass/crawler-google-places' +_YOUTUBE_SCRAPER_ACTOR_ID = 'streamers/youtube-scraper' +_ECOMMERCE_SCRAPER_ACTOR_ID = 'apify/e-commerce-scraping-tool' + +_YOUTUBE_SEARCH_TYPES = ('search', 'video', 'channel') +_ECOMMERCE_URL_TYPES = ('product', 'category') _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 _DEFAULT_DATASET_ITEMS_LIMIT = 100 @@ -223,8 +229,10 @@ def run_task_and_get_items( items = self._list_items_or_raise(dataset_id, dataset_items_limit) return run, items - def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: - """Scrape a single URL and return its content as markdown. + def _scrape_url( + self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS + ) -> tuple[dict, list[dict], str, str]: + """Scrape a single URL and return run/items/content metadata. Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. @@ -233,7 +241,8 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) timeout_secs: Maximum time to wait for the crawl to finish. Returns: - Markdown (or plain-text fallback) content of the page. + Tuple: ``(run, items, content, content_source)`` where + ``content_source`` is ``"markdown"`` or ``"text"``. Raises: RuntimeError: If the Actor run fails or no content is extracted. @@ -242,7 +251,7 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) 'startUrls': [{'url': url}], 'maxCrawlPages': 1, } - _, items = self.run_actor_and_get_items( + run, items = self.run_actor_and_get_items( _WEBSITE_CONTENT_CRAWLER_ACTOR_ID, run_input=run_input, timeout_secs=timeout_secs, @@ -252,10 +261,20 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) + markdown = items[0].get('markdown') or '' content = _extract_content(items[0]) if not content: msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) + return run, items, content, 'markdown' if markdown else 'text' + + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: + """Scrape a single URL and return only the page content. + + Thin public wrapper over :meth:`_scrape_url` for callers that don't need + the run/items metadata. + """ + _, _, content, _ = self._scrape_url(url=url, timeout_secs=timeout_secs) return content def google_search( @@ -284,10 +303,12 @@ def google_search( Raises: RuntimeError: If the Actor run fails. """ + # apify/google-search-scraper has no resultsPerPage input; result count + # is driven by maxPagesPerQuery (~10 results/page). Request enough pages + # to cover max_results, then slice the flattened results below. run_input: dict = { 'queries': query, - 'maxPagesPerQuery': 1, - 'resultsPerPage': max_results, + 'maxPagesPerQuery': max(1, (max_results + 9) // 10), } if country_code is not None: run_input['countryCode'] = country_code @@ -316,7 +337,7 @@ def rag_web_search( query: str, max_results: int = _DEFAULT_RAG_MAX_RESULTS, timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, - ) -> list[dict]: + ) -> tuple[dict, list[dict]]: """Search the web and return crawled page content for RAG. Uses ``apify/rag-web-browser``. @@ -327,8 +348,9 @@ def rag_web_search( timeout_secs: Maximum time to wait for the run to finish. Returns: - List of result dicts with ``crawledUrl``, ``title``, and - ``text`` keys (among others from the Actor). + A ``(run_details, items)`` tuple. Each item dict has at least + ``crawledUrl``, ``text``, and a nested ``metadata`` block (among + other keys returned by the Actor). Raises: RuntimeError: If the Actor run fails. @@ -337,13 +359,135 @@ def rag_web_search( 'query': query, 'maxResults': max_results, } - _, items = self.run_actor_and_get_items( + return self.run_actor_and_get_items( _RAG_WEB_BROWSER_ACTOR_ID, run_input=run_input, timeout_secs=timeout_secs, dataset_items_limit=max_results, ) - return items + + def google_maps_search( + self, + query: str, + max_results: int = 10, + language: str | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Search Google Maps places, reviews, and business details. + + Uses ``compass/crawler-google-places``. + + Args: + query: Search query string (e.g. ``"coffee shops in Berlin"``). + max_results: Maximum number of places to return. + language: Optional ISO language code for results (e.g. ``"en"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple where each item is a place dict. + + Raises: + RuntimeError: If the Actor run fails. + """ + run_input: dict = { + 'searchStringsArray': [query], + 'maxCrawledPlacesPerSearch': max_results, + } + if language is not None: + run_input['language'] = language + + return self.run_actor_and_get_items( + _GOOGLE_MAPS_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def youtube_scrape( + self, + search_query: str, + search_type: str = 'search', + max_results: int = 10, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape YouTube videos, channels, or search results. + + Uses ``streamers/youtube-scraper``. + + Args: + search_query: Keyword for ``search`` mode, or a video/channel URL + for ``video``/``channel`` modes. + search_type: One of ``"search"``, ``"video"``, ``"channel"``. + max_results: Maximum number of items to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``search_type`` is not a supported value. + RuntimeError: If the Actor run fails. + """ + if search_type not in _YOUTUBE_SEARCH_TYPES: + msg = f'Invalid search_type {search_type!r}; expected one of {_YOUTUBE_SEARCH_TYPES}.' + raise ValueError(msg) + + run_input: dict = {'maxResults': max_results} + if search_type == 'search': + run_input['searchQueries'] = [search_query] + else: + run_input['startUrls'] = [{'url': search_query}] + + return self.run_actor_and_get_items( + _YOUTUBE_SCRAPER_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def ecommerce_scrape( + self, + url: str, + url_type: str = 'product', + max_results: int = 20, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Extract product data from an e-commerce URL. + + Uses ``apify/e-commerce-scraping-tool``. ``url_type`` selects which + Actor input field the URL is sent as: ``"product"`` -> ``detailsUrls`` + (a single product-detail page), ``"category"`` -> ``listingUrls`` + (a category / listing page that the Actor will expand into product + results). + + Args: + url: Product-detail or category / listing URL to scrape. + url_type: One of ``"product"`` or ``"category"``. + max_results: Maximum number of products to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``url_type`` is not a supported value. + RuntimeError: If the Actor run fails. + """ + if url_type not in _ECOMMERCE_URL_TYPES: + msg = f'Invalid url_type {url_type!r}; expected one of {_ECOMMERCE_URL_TYPES}.' + raise ValueError(msg) + + input_key = 'detailsUrls' if url_type == 'product' else 'listingUrls' + run_input: dict = { + input_key: [{'url': url}], + 'maxProductResults': max_results, + } + return self.run_actor_and_get_items( + _ECOMMERCE_SCRAPER_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) def crawl_website( self, diff --git a/langchain_apify/_utils.py b/langchain_apify/_utils.py index 9250f39..f28d90d 100644 --- a/langchain_apify/_utils.py +++ b/langchain_apify/_utils.py @@ -83,15 +83,36 @@ def _extract_content(item: dict) -> str: return item.get('markdown') or item.get('text') or '' +def _item_metadata(item: dict) -> dict: + """Return an item's ``metadata`` block, or ``{}`` if missing/non-dict. + + Some Actors surface a ``null`` (or otherwise non-dict) ``metadata`` value, + so a plain ``item.get('metadata', {})`` would raise ``AttributeError`` on + the chained ``.get(...)``. + """ + meta = item.get('metadata') + return meta if isinstance(meta, dict) else {} + + def _safe_title(item: dict) -> str: """Return an Actor item's title from its nested ``metadata`` object. Both ``apify/website-content-crawler`` and ``apify/rag-web-browser`` nest - the page title under ``metadata.title``. The ``isinstance`` guard tolerates - Actor responses where ``metadata`` is missing or not a dict. + the page title under ``metadata.title``. The guard tolerates Actor + responses where ``metadata`` is missing or not a dict. + """ + return _item_metadata(item).get('title', '') + + +def _extract_source(item: dict) -> str: + """Return an Actor item's source URL via one canonical fallback order. + + ``apify/rag-web-browser`` items expose the page URL in several places. To + keep every consumer (RAG tool, retriever, loaders) in agreement, the order + is fixed here: nested ``metadata.url`` first, then ``crawledUrl``, then the + top-level ``url``. """ - metadata = item.get('metadata') - return metadata.get('title', '') if isinstance(metadata, dict) else '' + return _item_metadata(item).get('url') or item.get('crawledUrl') or item.get('url', '') def _prune_actor_input_schema( diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 6b4b09c..6ae737c 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -35,7 +35,8 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): To use, you should have the environment variable ``APIFY_TOKEN`` set with your API key, or pass ``apify_token`` as a named parameter to the - constructor. + constructor. ``APIFY_API_TOKEN`` is still accepted for backwards + compatibility. For details, see https://docs.apify.com/platform/integrations/langchain @@ -102,6 +103,9 @@ def __init__( def _init_client(self) -> ApifyDatasetLoader: """Validate the resolved Apify token and initialise the client. + The token default factory resolves ``APIFY_TOKEN`` first and + ``APIFY_API_TOKEN`` as a legacy fallback. + Returns: ApifyDatasetLoader: The validated loader instance. @@ -147,6 +151,7 @@ class ApifyCrawlLoader(BaseLoader): url: Seed URL to start crawling from. apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` environment variable when *None*. + apify_api_token: Deprecated alias for ``apify_token``. max_crawl_pages: Maximum number of pages to crawl. max_crawl_depth: Maximum link-follow depth from the seed URL. crawler_type: Crawler engine (e.g. ``"cheerio"``, ``"playwright:firefox"``). diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py index dfe1094..146f3d0 100644 --- a/langchain_apify/retrievers.py +++ b/langchain_apify/retrievers.py @@ -13,6 +13,7 @@ from langchain_apify._utils import ( _apify_token_secret_factory, _extract_content, + _extract_source, _resolve_deprecated_token_values, _safe_title, ) @@ -85,7 +86,7 @@ def _get_relevant_documents( *, run_manager: CallbackManagerForRetrieverRun | None = None, # noqa: ARG002 ) -> list[Document]: - items = self._client.rag_web_search( + _, items = self._client.rag_web_search( query, max_results=self.max_results, timeout_secs=self.timeout_secs, @@ -99,7 +100,7 @@ async def _aget_relevant_documents( run_manager: AsyncCallbackManagerForRetrieverRun | None = None, # noqa: ARG002 ) -> list[Document]: # ApifyToolsClient is sync-only. - items = await asyncio.to_thread( + _, items = await asyncio.to_thread( self._client.rag_web_search, query, max_results=self.max_results, @@ -113,12 +114,8 @@ def _items_to_documents(items: list[dict]) -> list[Document]: docs: list[Document] = [] for item in items: page_content = _extract_content(item) - raw_meta = item.get('metadata') - item_metadata: dict = raw_meta if isinstance(raw_meta, dict) else {} metadata: dict[str, Any] = { - # apify/rag-web-browser nests url/title under "metadata"; older - # Actors and tests use top-level keys. Both are supported. - 'source': item.get('crawledUrl') or item.get('url') or item_metadata.get('url', ''), + 'source': _extract_source(item), 'title': _safe_title(item), } docs.append(Document(page_content=page_content, metadata=metadata)) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 8c13787..266614f 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -476,7 +476,7 @@ class ApifyRunActorTool(_ApifyGenericTool): # type: ignore[override] name: str = 'apify_run_actor' description: str = ( - 'Run an Apify Actor synchronously and return run metadata as a JSON string.' + 'Run an Apify Actor synchronously and return a JSON envelope.' ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' f' Optional: run_input (dict), timeout_secs (int, default {_DEFAULT_RUN_TIMEOUT_SECS}),' ' memory_mbytes (int|null).' @@ -555,17 +555,16 @@ class ApifyRunActorAndGetDatasetTool(_ApifyGenericTool): # type: ignore[overrid """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` - into a single call. Returns a JSON string with ``run`` (metadata) and - ``items`` (list of dicts) keys. + into a single call. Returns a JSON envelope. Args: apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` environment variable when *None*. Returns: - JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, - ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list - of dataset item dicts). + JSON object ``{"run": {...}, "items": [...]}`` where ``run`` holds + ``run_id``, ``status``, ``dataset_id``, ``started_at``, ``finished_at`` + and ``items`` are the dataset item dicts. Example: .. code-block:: python @@ -584,7 +583,7 @@ class ApifyRunActorAndGetDatasetTool(_ApifyGenericTool): # type: ignore[overrid name: str = 'apify_run_actor_and_get_dataset' description: str = ( - 'Run an Apify Actor synchronously and return both run metadata and dataset items.' + 'Run an Apify Actor synchronously and return a JSON envelope.' ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' f' Optional: run_input (dict), timeout_secs (int, default {_DEFAULT_RUN_TIMEOUT_SECS}),' f' memory_mbytes (int|null), dataset_items_limit (int, default {_DEFAULT_DATASET_ITEMS_LIMIT}).' @@ -627,7 +626,7 @@ class ApifyScrapeUrlTool(_ApifyGenericTool): # type: ignore[override] environment variable when *None*. Returns: - JSON object ``{"run": null, "items": [{"url": ..., "content": ...}]}``. + JSON object ``{"run": {...}, "items": [{"url": ..., "content": ...}]}``. Example: .. code-block:: python @@ -646,7 +645,7 @@ class ApifyScrapeUrlTool(_ApifyGenericTool): # type: ignore[override] 'Scrape a single URL using Apify and return a JSON envelope.' ' Required: url (str) — the URL to scrape.' f' Optional: timeout_secs (int, default {_DEFAULT_SCRAPE_TIMEOUT_SECS}).' - ' Returns JSON with keys: run (null), items ([{url, content}];' + ' Returns JSON with keys: run, items ([{url, content}];' ' content is markdown, or plain text when markdown is unavailable).' ) args_schema: type[BaseModel] = ApifyScrapeUrlInput @@ -658,10 +657,12 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - content = self._client.scrape_url(url, self._clamp_timeout(timeout_secs)) + # _scrape_url is the rich primitive; scrape_url() drops the metadata + # this tool needs (run + content source), so access it directly. + run, _, content, _ = self._client._scrape_url(url, self._clamp_timeout(timeout_secs)) # noqa: SLF001 except RuntimeError as exc: raise ToolException(str(exc)) from exc - return json.dumps({'run': None, 'items': [{'url': url, 'content': content}]}, default=str) + return json.dumps({'run': _run_meta(run), 'items': [{'url': url, 'content': content}]}, default=str) class ApifyRunTaskTool(_ApifyGenericTool): # type: ignore[override] @@ -697,7 +698,7 @@ class ApifyRunTaskTool(_ApifyGenericTool): # type: ignore[override] name: str = 'apify_run_task' description: str = ( - 'Run a saved Apify Actor task synchronously and return run metadata as a JSON string.' + 'Run a saved Apify Actor task synchronously and return a JSON envelope.' ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' f' Optional: task_input (dict), timeout_secs (int, default {_DEFAULT_RUN_TIMEOUT_SECS}),' ' memory_mbytes (int|null).' @@ -727,17 +728,16 @@ class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override """Run a saved Apify Actor task and return both run metadata and dataset items. Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` - into a single call. Returns a JSON string with ``run`` (metadata) and - ``items`` (list of dicts) keys. + into a single call. Returns a JSON envelope. Args: apify_token: Apify API token. Falls back to the ``APIFY_TOKEN`` environment variable when *None*. Returns: - JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, - ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list - of dataset item dicts). + JSON object ``{"run": {...}, "items": [...]}`` where ``run`` holds + ``run_id``, ``status``, ``dataset_id``, ``started_at``, ``finished_at`` + and ``items`` are the dataset item dicts. Example: .. code-block:: python @@ -756,7 +756,7 @@ class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override name: str = 'apify_run_task_and_get_dataset' description: str = ( - 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' + 'Run a saved Apify Actor task synchronously and return a JSON envelope.' ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' f' Optional: task_input (dict), timeout_secs (int, default {_DEFAULT_RUN_TIMEOUT_SECS}),' f' memory_mbytes (int|null), dataset_items_limit (int, default {_DEFAULT_DATASET_ITEMS_LIMIT}).' diff --git a/tests/integration_tests/test_document_loaders.py b/tests/integration_tests/test_document_loaders.py index 674eeba..96eb4d1 100644 --- a/tests/integration_tests/test_document_loaders.py +++ b/tests/integration_tests/test_document_loaders.py @@ -1,11 +1,17 @@ from collections.abc import Iterator +import pytest from apify_client import ApifyClient from langchain_core.documents import Document from langchain_apify import ApifyDatasetLoader from langchain_apify._utils import _resolve_apify_token +pytestmark = pytest.mark.skipif( + not _resolve_apify_token(), + reason='APIFY_TOKEN not set', +) + def test_apify_dataset_loader_load() -> None: """Tests the ApifyDatasetLoader.load method. diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py index 60a9dde..787e9df 100644 --- a/tests/integration_tests/test_generic_tools.py +++ b/tests/integration_tests/test_generic_tools.py @@ -68,8 +68,9 @@ def test_scrape_url_tool_smoke() -> None: tool = ApifyScrapeUrlTool() result = tool.invoke({'url': 'https://crawlee.dev'}) - assert isinstance(result, str) - assert len(result) > 0 + parsed = json.loads(result) + assert parsed['content'] + assert parsed['meta']['content_length'] > 0 _TASK_ID = os.getenv('APIFY_TASK_ID', '') diff --git a/tests/integration_tests/test_tools.py b/tests/integration_tests/test_tools.py index 084cbb6..6b4369e 100644 --- a/tests/integration_tests/test_tools.py +++ b/tests/integration_tests/test_tools.py @@ -3,13 +3,20 @@ import json from typing import TYPE_CHECKING +import pytest from langchain_tests.integration_tests import ToolsIntegrationTests +from langchain_apify._utils import _resolve_apify_token from langchain_apify.tools import ApifyActorsTool if TYPE_CHECKING: from langchain_core.tools import BaseTool +pytestmark = pytest.mark.skipif( + not _resolve_apify_token(), + reason='APIFY_TOKEN not set', +) + class TestApifyActorsToolIntegration(ToolsIntegrationTests): """Integration tests for the ApifyActorsTool. diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 28ae02c..9d6bc9c 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -1,3 +1,4 @@ +import pytest from apify_client.client import ApifyClient from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -11,8 +12,7 @@ def test_get_actor_latest_build() -> None: ValueError: If the APIFY_TOKEN environment variable is not set. """ if (token := _resolve_apify_token()) is None: - msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET - raise ValueError(msg) + pytest.skip(_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET) apify_client = _create_apify_client(ApifyClient, token) diff --git a/tests/integration_tests/test_wrappers.py b/tests/integration_tests/test_wrappers.py index 73a8f40..3c2ee19 100644 --- a/tests/integration_tests/test_wrappers.py +++ b/tests/integration_tests/test_wrappers.py @@ -1,6 +1,13 @@ +import pytest from langchain_core.documents import Document from langchain_apify import ApifyWrapper +from langchain_apify._utils import _resolve_apify_token + +pytestmark = pytest.mark.skipif( + not _resolve_apify_token(), + reason='APIFY_TOKEN not set', +) def test_apify_wrapper_call_actor() -> None: diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 1666dfd..cdb3147 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -7,10 +7,18 @@ from langchain_core.tools import ToolException from pydantic import SecretStr -from langchain_apify import APIFY_SEARCH_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify import ( + APIFY_SEARCH_TOOLS, + ApifyEcommerceScraperTool, + ApifyGoogleMapsTool, + ApifyGoogleSearchTool, + ApifyRAGWebBrowserTool, + ApifyWebCrawlerTool, + ApifyYouTubeScraperTool, +) from langchain_apify._client import ApifyToolsClient from langchain_apify.tools import _ApifyGenericTool -from tests.unit_tests.conftest import make_tool +from tests.unit_tests.conftest import SUCCEEDED_RUN, make_tool # --------------------------------------------------------------------------- # ApifyGoogleSearchTool @@ -73,7 +81,8 @@ def test_google_search_tool_empty_results(mock_tools_client: MagicMock) -> None: result = tool._run(query='nothing') - assert json.loads(result) == {'run': None, 'items': []} + parsed = json.loads(result) + assert parsed == {'run': None, 'items': []} def test_google_search_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: @@ -194,7 +203,8 @@ def test_web_crawler_tool_empty_results(mock_tools_client: MagicMock) -> None: result = tool._run(url='https://example.com') - assert json.loads(result) == {'run': None, 'items': []} + parsed = json.loads(result) + assert parsed == {'run': None, 'items': []} def test_web_crawler_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: @@ -213,30 +223,299 @@ def test_web_crawler_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None # --------------------------------------------------------------------------- -# Metadata & inheritance +# Search & Crawling tools — happy paths +# --------------------------------------------------------------------------- + + +def test_rag_web_browser_tool_returns_json(mock_tools_client: MagicMock) -> None: + items = [ + { + 'crawledUrl': 'https://example.com/1', + 'metadata': {'url': 'https://example.com/1', 'title': 'Page 1'}, + 'markdown': '# Page 1', + 'text': 'Page 1 plain', + }, + { + 'crawledUrl': 'https://example.com/2', + 'metadata': {'title': 'Page 2'}, + 'text': 'Page 2 plain', + }, + ] + mock_tools_client.rag_web_search.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyRAGWebBrowserTool, mock_tools_client) + + parsed = json.loads(tool._run(query='what is langchain', max_results=3)) + + assert parsed['items'] == [ + {'url': 'https://example.com/1', 'title': 'Page 1', 'content': '# Page 1'}, + {'url': 'https://example.com/2', 'title': 'Page 2', 'content': 'Page 2 plain'}, + ] + assert parsed['run']['status'] == 'SUCCEEDED' + mock_tools_client.rag_web_search.assert_called_once_with( + 'what is langchain', + max_results=3, + timeout_secs=tool.max_timeout_secs, + ) + + +def test_google_maps_tool_returns_json(mock_tools_client: MagicMock) -> None: + items = [{'name': 'Cafe A', 'address': 'Berlin'}] + mock_tools_client.google_maps_search.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyGoogleMapsTool, mock_tools_client) + + parsed = json.loads(tool._run(query='cafe in Berlin', max_results=2, language='en')) + + assert parsed['run']['dataset_id'] == SUCCEEDED_RUN['defaultDatasetId'] + assert parsed['items'] == items + mock_tools_client.google_maps_search.assert_called_once_with( + 'cafe in Berlin', + max_results=2, + language='en', + timeout_secs=tool.max_timeout_secs, + ) + + +def test_youtube_tool_returns_json(mock_tools_client: MagicMock) -> None: + items = [{'title': 'Vid 1'}] + mock_tools_client.youtube_scrape.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyYouTubeScraperTool, mock_tools_client) + + parsed = json.loads(tool._run(search_query='langchain', search_type='search', max_results=4)) + + assert parsed['items'] == items + mock_tools_client.youtube_scrape.assert_called_once_with( + search_query='langchain', + search_type='search', + max_results=4, + timeout_secs=tool.max_timeout_secs, + ) + + +def test_youtube_tool_invalid_search_type_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.youtube_scrape.side_effect = ValueError('Invalid search_type playlist') + tool = make_tool(ApifyYouTubeScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='Invalid search_type'): + tool._run(search_query='x', search_type='search') + + +def test_ecommerce_tool_returns_json(mock_tools_client: MagicMock) -> None: + items = [{'sku': 'A1', 'price': 9.99}] + mock_tools_client.ecommerce_scrape.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyEcommerceScraperTool, mock_tools_client) + + parsed = json.loads(tool._run(url='https://shop.example.com/p/123', max_results=5)) + + assert parsed['items'] == items + mock_tools_client.ecommerce_scrape.assert_called_once_with( + 'https://shop.example.com/p/123', + url_type='product', + max_results=5, + timeout_secs=tool.max_timeout_secs, + ) + + +def test_ecommerce_tool_category_mode_passes_url_type(mock_tools_client: MagicMock) -> None: + items = [{'sku': 'B2', 'price': 19.99}] + mock_tools_client.ecommerce_scrape.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyEcommerceScraperTool, mock_tools_client) + + parsed = json.loads(tool._run(url='https://shop.example.com/cat/42', url_type='category', max_results=8)) + + assert parsed['items'] == items + mock_tools_client.ecommerce_scrape.assert_called_once_with( + 'https://shop.example.com/cat/42', + url_type='category', + max_results=8, + timeout_secs=tool.max_timeout_secs, + ) + + +def test_ecommerce_tool_invalid_url_type_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.ecommerce_scrape.side_effect = ValueError('Invalid url_type listing') + tool = make_tool(ApifyEcommerceScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='Invalid url_type'): + tool._run(url='https://shop.example.com', url_type='product') + + +# --------------------------------------------------------------------------- +# US-4 Search & Crawling tools — parametrized error / empty / handle_tool_error # --------------------------------------------------------------------------- +# Each entry: (tool_class, helper_attribute_name, kwargs_for_run) +_TOOL_INVOCATIONS: list[tuple[type[_ApifyGenericTool], str, dict]] = [ + (ApifyGoogleSearchTool, 'google_search', {'query': 'q'}), + (ApifyWebCrawlerTool, 'crawl_website', {'url': 'https://example.com'}), + (ApifyRAGWebBrowserTool, 'rag_web_search', {'query': 'q'}), + (ApifyGoogleMapsTool, 'google_maps_search', {'query': 'q'}), + (ApifyYouTubeScraperTool, 'youtube_scrape', {'search_query': 'q'}), + (ApifyEcommerceScraperTool, 'ecommerce_scrape', {'url': 'https://example.com'}), +] + +# Tools that return the {run, items} envelope on success. +_ENVELOPE_TOOL_INVOCATIONS: list[tuple[type[_ApifyGenericTool], str, dict]] = [ + (ApifyGoogleMapsTool, 'google_maps_search', {'query': 'q'}), + (ApifyYouTubeScraperTool, 'youtube_scrape', {'search_query': 'q'}), + (ApifyEcommerceScraperTool, 'ecommerce_scrape', {'url': 'https://example.com'}), +] + + +@pytest.mark.parametrize(('tool_cls', 'helper_attr', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_search_tool_runtime_error_raises_tool_exception( + mock_tools_client: MagicMock, + tool_cls: type, + helper_attr: str, + run_kwargs: dict, +) -> None: + getattr(mock_tools_client, helper_attr).side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(tool_cls, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(**run_kwargs) + + +@pytest.mark.parametrize(('tool_cls', 'helper_attr', 'run_kwargs'), _ENVELOPE_TOOL_INVOCATIONS) +def test_search_tool_empty_dataset_returns_empty_items( + mock_tools_client: MagicMock, + tool_cls: type, + helper_attr: str, + run_kwargs: dict, +) -> None: + getattr(mock_tools_client, helper_attr).return_value = (SUCCEEDED_RUN, []) + tool = make_tool(tool_cls, mock_tools_client) + + parsed = json.loads(tool._run(**run_kwargs)) + assert parsed['items'] == [] + assert parsed['run']['status'] == 'SUCCEEDED' + + +def test_rag_web_browser_tool_empty_dataset_returns_empty_array(mock_tools_client: MagicMock) -> None: + mock_tools_client.rag_web_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyRAGWebBrowserTool, mock_tools_client) + + parsed = json.loads(tool._run(query='q')) + assert parsed['items'] == [] + assert parsed['run']['status'] == 'SUCCEEDED' + + +@pytest.mark.parametrize(('tool_cls', 'helper_attr', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_search_tool_handle_tool_error_swallows( + mock_tools_client: MagicMock, + tool_cls: type, + helper_attr: str, + run_kwargs: dict, +) -> None: + """``handle_tool_error=True`` (inherited) means ``invoke`` returns the error string.""" + getattr(mock_tools_client, helper_attr).side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(tool_cls, mock_tools_client) + + result = tool.invoke(run_kwargs) + assert 'FAILED' in result + + +@pytest.mark.parametrize(('tool_cls', 'helper_attr', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_search_tool_missing_token( + monkeypatch: pytest.MonkeyPatch, + tool_cls: type, + helper_attr: str, # noqa: ARG001 + run_kwargs: dict, # noqa: ARG001 +) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + tool_cls() + -def test_actor_tools_inherit_from_generic_base() -> None: - for tool_cls in (ApifyGoogleSearchTool, ApifyWebCrawlerTool): +def test_search_tools_inherit_from_generic_base() -> None: + for tool_cls, _, _ in _TOOL_INVOCATIONS: assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' -def test_actor_tools_have_correct_metadata() -> None: +def test_search_tools_have_correct_metadata() -> None: + cases: list[tuple[type, str]] = [ + (ApifyGoogleSearchTool, 'apify_google_search'), + (ApifyWebCrawlerTool, 'apify_web_crawler'), + (ApifyRAGWebBrowserTool, 'apify_rag_web_browser'), + (ApifyGoogleMapsTool, 'apify_google_maps'), + (ApifyYouTubeScraperTool, 'apify_youtube_scraper'), + (ApifyEcommerceScraperTool, 'apify_ecommerce_scraper'), + ] with patch.object(ApifyToolsClient, '__init__', return_value=None): - tools = [ - ApifyGoogleSearchTool(apify_token=SecretStr('dummy')), - ApifyWebCrawlerTool(apify_token=SecretStr('dummy')), - ] - - expected_names = ['apify_google_search', 'apify_web_crawler'] - for tool, expected_name in zip(tools, expected_names): - assert tool.name == expected_name - assert tool.description - assert tool.args_schema is not None - assert tool.handle_tool_error is True + for tool_cls, expected_name in cases: + tool = tool_cls(apify_token=SecretStr('dummy')) + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True def test_apify_search_tools_list() -> None: - assert set(APIFY_SEARCH_TOOLS) == {ApifyGoogleSearchTool, ApifyWebCrawlerTool} - assert len(APIFY_SEARCH_TOOLS) == 2 + assert set(APIFY_SEARCH_TOOLS) == { + ApifyGoogleSearchTool, + ApifyWebCrawlerTool, + ApifyRAGWebBrowserTool, + ApifyGoogleMapsTool, + ApifyYouTubeScraperTool, + ApifyEcommerceScraperTool, + } + assert len(APIFY_SEARCH_TOOLS) == 6 + + +# --------------------------------------------------------------------------- +# Regression: dataset items containing datetime values must not break JSON +# serialisation. The Apify client's clean=True deserialiser returns datetime +# objects for certain timestamp fields (notably Google Maps reviews and +# YouTube publishedAt), which previously raised +# ``TypeError: Object of type datetime is not JSON serializable`` inside +# ``_serialize_tool_response``. +# --------------------------------------------------------------------------- + + +# Tools that hand the client's items list straight to _serialize_tool_response, +# i.e. those most exposed to raw datetime values from the Actor's dataset. +_RETURN_LIST = 'list' +_RETURN_ENVELOPE = 'envelope' + +# Each entry: (tool_cls, client_helper_attr, run_kwargs, client_return_shape). +# Listed tools hand the client's items straight to _serialize_tool_response, +# i.e. they are most exposed to raw datetime values from the Actor's dataset. +_PASSTHROUGH_TOOL_INVOCATIONS: list[tuple[type[_ApifyGenericTool], str, dict, str]] = [ + (ApifyGoogleSearchTool, 'google_search', {'query': 'q'}, _RETURN_LIST), + (ApifyGoogleMapsTool, 'google_maps_search', {'query': 'q'}, _RETURN_ENVELOPE), + (ApifyYouTubeScraperTool, 'youtube_scrape', {'search_query': 'q'}, _RETURN_ENVELOPE), + (ApifyEcommerceScraperTool, 'ecommerce_scrape', {'url': 'https://example.com'}, _RETURN_ENVELOPE), +] + + +@pytest.mark.parametrize( + ('tool_cls', 'helper_attr', 'run_kwargs', 'client_return_shape'), + _PASSTHROUGH_TOOL_INVOCATIONS, +) +def test_search_tool_serialises_datetime_in_items( + mock_tools_client: MagicMock, + tool_cls: type, + helper_attr: str, + run_kwargs: dict, + client_return_shape: str, +) -> None: + from datetime import datetime, timezone + + timestamp = datetime(2026, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + item_with_datetime = {'id': 'item-1', 'published_at': timestamp, 'text': 'hi'} + items = [item_with_datetime] + + if client_return_shape == _RETURN_ENVELOPE: + getattr(mock_tools_client, helper_attr).return_value = (SUCCEEDED_RUN, items) + else: + getattr(mock_tools_client, helper_attr).return_value = items + tool = make_tool(tool_cls, mock_tools_client) + + result = tool._run(**run_kwargs) + parsed = json.loads(result) + + assert isinstance(parsed['items'], list) + assert len(parsed['items']) == 1 + assert parsed['items'][0]['id'] == 'item-1' + assert isinstance(parsed['items'][0]['published_at'], str) + assert '2026-01-02' in parsed['items'][0]['published_at'] diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index dcde957..b128a04 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -314,10 +314,91 @@ def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_a client.run_actor('apify/test-actor') +# --------------------------------------------------------------------------- +# _scrape_url +# --------------------------------------------------------------------------- + + +def test__scrape_url_returns_markdown_and_metadata(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, + ] + + run, items, content, source = client._scrape_url('https://example.com') + assert run == SUCCEEDED_RUN + assert items + assert content == '# Hello' + assert source == 'markdown' + + +def test__scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'text': 'Plain text content', 'url': 'https://example.com'}, + ] + + _, _, content, source = client._scrape_url('https://example.com') + assert content == 'Plain text content' + assert source == 'text' + + # --------------------------------------------------------------------------- # google_search # --------------------------------------------------------------------------- + +def test_google_search_input_mapping(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + { + 'organicResults': [ + {'title': 'A', 'url': 'https://a.com', 'description': 'da'}, + {'title': 'B', 'url': 'https://b.com', 'description': 'db'}, + ] + } + ] + + results = client.google_search('langchain', max_results=5, country_code='us', language_code='en') + + mock_apify_client.actor.assert_called_once_with('apify/google-search-scraper') + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == { + 'queries': 'langchain', + 'maxPagesPerQuery': 1, + 'countryCode': 'us', + 'languageCode': 'en', + } + assert len(results) == 2 + assert results[0]['title'] == 'A' + + +def test_google_search_scales_pages_to_max_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + # ~10 results/page, so 25 results needs ceil(25 / 10) == 3 pages. + client.google_search('langchain', max_results=25) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input['maxPagesPerQuery'] == 3 + assert 'resultsPerPage' not in run_input + + +def test_google_search_omits_optional_locale_params(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.google_search('langchain') + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert 'countryCode' not in run_input + assert 'languageCode' not in run_input + + +# google_search +# --------------------------------------------------------------------------- + GOOGLE_SEARCH_ITEMS: list[dict] = [ { 'organicResults': [ @@ -391,8 +472,9 @@ def test_rag_web_search_success(client: ApifyToolsClient, mock_apify_client: Mag mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = RAG_SEARCH_ITEMS - items = client.rag_web_search('test query', max_results=5) + run, items = client.rag_web_search('test query', max_results=5) + assert run == SUCCEEDED_RUN assert len(items) == 2 assert items[0]['crawledUrl'] == 'https://example.com/1' assert items[1]['text'] == 'Page 2 content' @@ -402,8 +484,9 @@ def test_rag_web_search_empty(client: ApifyToolsClient, mock_apify_client: Magic mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [] - items = client.rag_web_search('test') + run, items = client.rag_web_search('test') + assert run == SUCCEEDED_RUN assert items == [] @@ -463,3 +546,152 @@ def test_crawl_website_failed_run_raises(client: ApifyToolsClient, mock_apify_cl with pytest.raises(RuntimeError, match='run-fail'): client.crawl_website('https://example.com') + + +# --------------------------------------------------------------------------- +# google_maps_search +# --------------------------------------------------------------------------- + + +def test_google_maps_search_input_mapping(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.google_maps_search('coffee in Berlin', max_results=5, language='en') + + mock_apify_client.actor.assert_called_once_with('compass/crawler-google-places') + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == { + 'searchStringsArray': ['coffee in Berlin'], + 'maxCrawledPlacesPerSearch': 5, + 'language': 'en', + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_google_maps_search_omits_language_when_none(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.google_maps_search('parks') + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert 'language' not in run_input + + +def test_google_maps_search_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.google_maps_search('parks') + + +# --------------------------------------------------------------------------- +# youtube_scrape +# --------------------------------------------------------------------------- + + +def test_youtube_scrape_search_mode_input_mapping(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.youtube_scrape('langchain', search_type='search', max_results=7) + + mock_apify_client.actor.assert_called_once_with('streamers/youtube-scraper') + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == {'maxResults': 7, 'searchQueries': ['langchain']} + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +@pytest.mark.parametrize('search_type', ['video', 'channel']) +def test_youtube_scrape_url_modes_use_start_urls( + client: ApifyToolsClient, mock_apify_client: MagicMock, search_type: str +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.youtube_scrape('https://www.youtube.com/@apify', search_type=search_type, max_results=4) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == { + 'maxResults': 4, + 'startUrls': [{'url': 'https://www.youtube.com/@apify'}], + } + + +def test_youtube_scrape_invalid_search_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Invalid search_type'): + client.youtube_scrape('langchain', search_type='playlist') + + +def test_youtube_scrape_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.youtube_scrape('langchain') + + +# --------------------------------------------------------------------------- +# ecommerce_scrape +# --------------------------------------------------------------------------- + + +def test_ecommerce_scrape_input_mapping(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.ecommerce_scrape('https://shop.example.com/cat/123', max_results=15) + + mock_apify_client.actor.assert_called_once_with('apify/e-commerce-scraping-tool') + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == { + 'detailsUrls': [{'url': 'https://shop.example.com/cat/123'}], + 'maxProductResults': 15, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_ecommerce_scrape_category_mode_uses_listing_urls( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + client.ecommerce_scrape('https://shop.example.com/category/123', url_type='category', max_results=5) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == { + 'listingUrls': [{'url': 'https://shop.example.com/category/123'}], + 'maxProductResults': 5, + } + + +def test_ecommerce_scrape_invalid_url_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Invalid url_type'): + client.ecommerce_scrape('https://shop.example.com', url_type='listing') + + +def test_ecommerce_scrape_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.ecommerce_scrape('https://shop.example.com') + + +# --------------------------------------------------------------------------- +# rag_web_search input mapping +# --------------------------------------------------------------------------- + + +def test_rag_web_search_input_mapping(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.rag_web_search('what is langchain', max_results=4) + + mock_apify_client.actor.assert_called_once_with('apify/rag-web-browser') + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs['run_input'] + assert run_input == {'query': 'what is langchain', 'maxResults': 4} diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py index 8d5c0ed..e3372d7 100644 --- a/tests/unit_tests/test_retrievers.py +++ b/tests/unit_tests/test_retrievers.py @@ -58,6 +58,15 @@ def test_init_custom_params() -> None: assert retriever.timeout_secs == 60 +def test_deprecated_apify_api_token_alias_warns() -> None: + # ``apify_api_token`` is a runtime alias handled by a model validator, not a + # declared field, hence the call-arg ignore. + with patch.object(ApifyToolsClient, '__init__', return_value=None): + with pytest.warns(DeprecationWarning, match='apify_api_token'): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('legacy-token')) # type: ignore[call-arg] + assert retriever.apify_token == SecretStr('legacy-token') + + # --------------------------------------------------------------------------- # Sync retrieval # --------------------------------------------------------------------------- @@ -65,7 +74,7 @@ def test_init_custom_params() -> None: def test_sync_returns_documents() -> None: mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = RAG_ITEMS + mock_client.rag_web_search.return_value = ({}, RAG_ITEMS) retriever = _make_retriever(mock_client, max_results=5) docs = retriever._get_relevant_documents('test query') @@ -81,7 +90,7 @@ def test_sync_returns_documents() -> None: def test_sync_calls_helper_with_correct_args() -> None: mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = [] + mock_client.rag_web_search.return_value = ({}, []) retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) retriever._get_relevant_documents('my search') @@ -95,7 +104,7 @@ def test_sync_calls_helper_with_correct_args() -> None: def test_sync_empty_results() -> None: mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = [] + mock_client.rag_web_search.return_value = ({}, []) retriever = _make_retriever(mock_client) docs = retriever._get_relevant_documents('test') @@ -123,7 +132,7 @@ def test_sync_helper_failure_propagates() -> None: async def test_async_returns_documents() -> None: """Async path wraps the sync helper via asyncio.to_thread.""" mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = RAG_ITEMS + mock_client.rag_web_search.return_value = ({}, RAG_ITEMS) retriever = _make_retriever(mock_client, max_results=5) docs = await retriever._aget_relevant_documents('test query') @@ -137,7 +146,7 @@ async def test_async_returns_documents() -> None: @pytest.mark.asyncio async def test_async_calls_helper_with_correct_args() -> None: mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = [] + mock_client.rag_web_search.return_value = ({}, []) retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) await retriever._aget_relevant_documents('my search') @@ -152,7 +161,7 @@ async def test_async_calls_helper_with_correct_args() -> None: @pytest.mark.asyncio async def test_async_empty_results() -> None: mock_client = MagicMock(spec=ApifyToolsClient) - mock_client.rag_web_search.return_value = [] + mock_client.rag_web_search.return_value = ({}, []) retriever = _make_retriever(mock_client) docs = await retriever._aget_relevant_documents('test') diff --git a/tests/unit_tests/test_tool_response_schema.py b/tests/unit_tests/test_tool_response_schema.py new file mode 100644 index 0000000..5633671 --- /dev/null +++ b/tests/unit_tests/test_tool_response_schema.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import json +from unittest.mock import MagicMock + +import pytest + +from langchain_apify import ( + APIFY_CORE_TOOLS, + APIFY_SEARCH_TOOLS, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) +from langchain_apify._actor_tools import ( + ApifyEcommerceScraperTool, + ApifyGoogleMapsTool, + ApifyGoogleSearchTool, + ApifyRAGWebBrowserTool, + ApifyWebCrawlerTool, + ApifyYouTubeScraperTool, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool + + +def _assert_envelope_shape(payload: dict) -> None: + assert set(payload) == {'run', 'items'} + assert isinstance(payload['items'], list) + assert payload['run'] is None or isinstance(payload['run'], dict) + + +@pytest.mark.parametrize( + ('tool_cls', 'setup_method', 'run_kwargs'), + [ + (ApifyRunActorTool, 'run_actor', {'actor_id': 'apify/test'}), + (ApifyGetDatasetItemsTool, 'get_dataset_items', {'dataset_id': 'dataset-xyz'}), + (ApifyRunActorAndGetDatasetTool, 'run_actor_and_get_items', {'actor_id': 'apify/test'}), + (ApifyScrapeUrlTool, '_scrape_url', {'url': 'https://example.com'}), + (ApifyRunTaskTool, 'run_task', {'task_id': 'user/my-task'}), + (ApifyRunTaskAndGetDatasetTool, 'run_task_and_get_items', {'task_id': 'user/my-task'}), + (ApifyGoogleSearchTool, 'google_search', {'query': 'apify'}), + (ApifyWebCrawlerTool, 'crawl_website', {'url': 'https://example.com'}), + (ApifyRAGWebBrowserTool, 'rag_web_search', {'query': 'langchain'}), + (ApifyGoogleMapsTool, 'google_maps_search', {'query': 'coffee'}), + (ApifyYouTubeScraperTool, 'youtube_scrape', {'search_query': 'langchain'}), + (ApifyEcommerceScraperTool, 'ecommerce_scrape', {'url': 'https://shop.example.com/p/1'}), + ], +) +def test_all_tools_return_normalized_envelope( + mock_tools_client: MagicMock, tool_cls: type, setup_method: str, run_kwargs: dict +) -> None: + if setup_method in {'run_actor', 'run_task'}: + getattr(mock_tools_client, setup_method).return_value = SUCCEEDED_RUN + elif setup_method == 'get_dataset_items': + getattr(mock_tools_client, setup_method).return_value = SAMPLE_ITEMS + elif setup_method == '_scrape_url': + mock_tools_client._scrape_url.return_value = ( + SUCCEEDED_RUN, + [{'url': 'https://example.com', 'markdown': '# content'}], + '# content', + 'markdown', + ) + elif setup_method in {'run_actor_and_get_items', 'run_task_and_get_items'}: + getattr(mock_tools_client, setup_method).return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + elif setup_method == 'google_search': + mock_tools_client.google_search.return_value = [{'title': 'A', 'url': 'https://a', 'description': 'd'}] + elif setup_method == 'crawl_website': + mock_tools_client.crawl_website.return_value = [ + {'url': 'https://example.com', 'markdown': '# Home', 'metadata': {'title': 'Home'}} + ] + elif setup_method == 'rag_web_search': + mock_tools_client.rag_web_search.return_value = ( + SUCCEEDED_RUN, + [{'crawledUrl': 'https://example.com', 'metadata': {'title': 'Home'}, 'text': 'Home'}], + ) + elif setup_method in {'google_maps_search', 'youtube_scrape', 'ecommerce_scrape'}: + getattr(mock_tools_client, setup_method).return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + + tool = make_tool(tool_cls, mock_tools_client) + payload = json.loads(tool._run(**run_kwargs)) + _assert_envelope_shape(payload) + + +def test_empty_result_is_normalized(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_maps_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyGoogleMapsTool, mock_tools_client) + + payload = json.loads(tool._run(query='empty')) + _assert_envelope_shape(payload) + assert payload['items'] == [] + + +def test_tool_group_lists_cover_all_normalized_tools() -> None: + assert set(APIFY_CORE_TOOLS) == { + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, + } + assert set(APIFY_SEARCH_TOOLS) == { + ApifyGoogleSearchTool, + ApifyWebCrawlerTool, + ApifyRAGWebBrowserTool, + ApifyGoogleMapsTool, + ApifyYouTubeScraperTool, + ApifyEcommerceScraperTool, + } diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index ea5405a..27fc48c 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -175,6 +175,27 @@ def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: assert parsed['items'] == [] +def test_tool_response_handles_datetime_in_items(mock_tools_client: MagicMock) -> None: + """Regression: datetime values inside ``items`` must not break serialization. + + The Apify client's ``clean=True`` deserialiser returns ``datetime`` + objects for certain timestamp fields (Google Maps reviews, YouTube + publishedAt, etc.). Without ``default=str`` in ``json.dumps``, this + raised ``TypeError: Object of type datetime is not JSON serializable`` + and the LLM saw an empty / error tool result instead of data. + """ + timestamp = datetime(2026, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + items = [{'id': 'item-1', 'published_at': timestamp, 'text': 'hi'}] + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, items) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) + + parsed = json.loads(tool._run(actor_id='apify/test')) + + assert parsed['items'][0]['id'] == 'item-1' + assert isinstance(parsed['items'][0]['published_at'], str) + assert '2026-01-02' in parsed['items'][0]['published_at'] + + # --------------------------------------------------------------------------- # ApifyRunActorTool # --------------------------------------------------------------------------- @@ -235,8 +256,7 @@ def test_get_dataset_items_tool_empty_returns_empty_items(mock_tools_client: Mag result = tool._run(dataset_id='dataset-empty') parsed = json.loads(result) - assert parsed['run'] is None - assert parsed['items'] == [] + assert parsed == {'run': None, 'items': []} def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: @@ -297,19 +317,24 @@ def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPa def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: - mock_tools_client.scrape_url.return_value = '# Hello World' + mock_tools_client._scrape_url.return_value = ( + SUCCEEDED_RUN, + [{'url': 'https://example.com', 'markdown': '# Hello World'}], + '# Hello World', + 'markdown', + ) tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) result = tool._run(url='https://example.com') parsed = json.loads(result) - assert parsed['run'] is None + assert parsed['run']['status'] == 'SUCCEEDED' assert parsed['items'] == [{'url': 'https://example.com', 'content': '# Hello World'}] - mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 120) + mock_tools_client._scrape_url.assert_called_once_with('https://example.com', 120) def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: - mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') + mock_tools_client._scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) with pytest.raises(ToolException, match='No content extracted'): @@ -451,12 +476,17 @@ def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) - def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: - mock_tools_client.scrape_url.return_value = '# content' + mock_tools_client._scrape_url.return_value = ( + SUCCEEDED_RUN, + [{'url': 'https://example.com', 'text': '# content'}], + '# content', + 'text', + ) tool = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30) tool._run(url='https://example.com', timeout_secs=9999) - mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 30) + mock_tools_client._scrape_url.assert_called_once_with('https://example.com', 30) def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None: