diff --git a/src/webcap/lib/synthesis/scraper/prompts.py b/src/webcap/lib/synthesis/scraper/prompts.py index 534f26a8..21cbf32e 100644 --- a/src/webcap/lib/synthesis/scraper/prompts.py +++ b/src/webcap/lib/synthesis/scraper/prompts.py @@ -1,19 +1,20 @@ -import re -import json - from enum import Enum from typing import Dict -from pathlib import Path from webcap.utils.files import read_file, current_dir -GENERATION: Path = current_dir(__file__) / 'templates' / 'generation.md' -CORRECTION: Path = current_dir(__file__) / 'templates' / 'correction.md' -LINTING: Path = current_dir(__file__) / 'templates' / 'linting.md' +THIS_DIR = current_dir(__file__) + +API = THIS_DIR / 'templates' / 'api.md' +GENERATION = THIS_DIR / 'templates' / 'generation.md' +CORRECTION = THIS_DIR / 'templates' / 'correction.md' +LINTING = THIS_DIR / 'templates' / 'linting.md' class PromptVariable(Enum): + api = "WEBCAP_API" + url = "TARGET_URL" correction = "CORRECTION" desc = "SCRAPER_DESCRIPTION" @@ -54,8 +55,9 @@ def generation_prompt(description: str, url: str) -> str: if not isinstance(url, str): raise ValueError(f"\'url\' should be a string") + api = read_file(API) template = read_file(GENERATION) - return fill(template, url=url, desc=description) + return fill(template, api=api, url=url, desc=description) def correction_prompt(correction: str) -> str: diff --git a/src/webcap/lib/synthesis/scraper/templates/api.md b/src/webcap/lib/synthesis/scraper/templates/api.md new file mode 100644 index 00000000..6844b0ca --- /dev/null +++ b/src/webcap/lib/synthesis/scraper/templates/api.md @@ -0,0 +1,72 @@ +#### WebCap API Description + +```py +def visit(self, url: str) -> str : ... +``` +- To visit a page, returns the visited url. + +```py +def click(self, element: ElementHandle, navigate: bool = True) -> Page : ... +``` +- To click on a link/button, returns the current `Playwright` page (Do not interact with this return value directly). 
+ + ```py +def fetch( + self, + id: str = None, + query_or_desc: str = None, + tree=None +) -> str | ElementHandle | list[str | ElementHandle] : ... +``` +- To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description, but know that it is converted to XPath internally). + + +```py +def scroll_down( + self, + step: int = 800, + max_scrolls: int = 25, + wait_ms: int = 250, + stabilize_rounds: int = 3 +) -> Page : ... +``` +- To scroll down and load dynamic content, returns the current `Playwright` page (Do not interact with this return value directly). + + +```py +def download(self, target: str | ElementHandle, name: str = None) -> Path: ... +``` +- To download a file (ex: `.pdf`) specified as an absolute url or a clickable element, returns the path to the downloaded file. + + +```py +def searchdocs( + self, id, + description, + docs: str | Path | List[str | Path] = None, + ignorecase: bool = False +) -> list[DocSearchResult] : ... +``` +- To search for regex matches in a downloaded file, returns a list of `DocSearchResult` (use textual description, but know that it is converted to a regex internally). + + +```py +def store(self, data: Union[Any, List[Any]], db: Database = None) -> None: ... +``` +- To save a record to the database, `data` can be a single element or a list, returns `None`. + +#### Helper Objects / Modules + +- `Playwright`: Python package `playwright` is a Python library to automate Chromium, Firefox and WebKit with a single API. + +- `ElementHandle`: Playwright object that represents an in-page DOM element. 
+ +- `DocSearchResult`: The dataclass implemented below +```py +@dataclass +class DocSearchResult: + name: Path + data: List[re.Match[str]] + regex: re.Pattern +``` \ No newline at end of file diff --git a/src/webcap/lib/synthesis/scraper/templates/generation.md b/src/webcap/lib/synthesis/scraper/templates/generation.md index dc9e9009..d7693d88 100644 --- a/src/webcap/lib/synthesis/scraper/templates/generation.md +++ b/src/webcap/lib/synthesis/scraper/templates/generation.md @@ -5,18 +5,8 @@ You are a Python code generator specialized in creating web scrapers that follow ## MANDATORY REQUIREMENTS -### 1. API Primitives (ONLY THESE LIB CALLS ARE ALLOWED) -You must use ONLY these primitives to interact with web pages: -- `webcap.visit(url | element)` - To visit a page, returns None; -- `webcap.click(description)` - To click on a link/button, returns None (use textual description); -- `webcap.fetch(#id, description)` - To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description only, but if it helps know that it uses xpath internally); -- `webcap.scroll_down(max_scrolls=N)` - To scroll down and load dynamic content, returns None (optional: step, wait_ms, stabilize_rounds); -- `webcap.download(url | element)` - To download a file (ex: pdf) specified as an absolute url or a clickable element, returns the path to the downloaded file; -- `webcap.searchdocs(regex_query, docs | [docs])` - To search for regex matches in a downloaded file, returns [DocSearchResults] (leave as an empty string, but if it helps know that it uses regex internally); -- `webcap.store(obj, db)` - To save a record to the database, obj can be a single element or a list. - -**Note:** Use `scroll_down()` when you need to load content that appears as you scroll (infinite scroll, lazy loading, etc.) -**Note:** Use `download()` and `searchdocs()` when the desired content is inside a file +### 1. 
API Primitives (ONLY WEBCAPLIB CALLS ARE ALLOWED) +`<>` ### 2. Code Structure Template Follow this EXACT structure: @@ -115,16 +105,17 @@ if __name__ == '__main__': ### 3. Generation Rules **CRITICAL RULES:** -1. **DO NOT** use xpath in fetch() calls - use natural language descriptions; +1. **DO NOT** use xpath in `fetch()` calls - use natural language descriptions; 2. **DO NOT** add any selenium, beautifulsoup, or other scraping libraries; -3. **DO NOT** modify the main() structure; +3. **DO NOT** modify the `main()` structure; 4. **DO NOT** add extra imports beyond the template; -5. **DO NOT** extract and/or set the _id, _key fields. Those are handled internally; +5. **DO NOT** extract and/or set the `_id` field. This is handled internally; 6. **DO NOT** name any new database field starting with '_'; -7. **ALWAYS** use only the primitives: visit(), click(), fetch(), scroll_down(), download(), searchdocs(), store(); +7. **ALWAYS** use only the primitives: `visit()`, `click()`, `fetch()`, `scroll_down()`, `download()`, `searchdocs()`, `store()`; 8. **ALWAYS** be sure to add unique identifier as the first argument of each fetch. 9. **ALWAYS** include cleaner functions as @staticmethod but only for numeric or very complex fields; -10. **ALWAYS** call cleaner functions when processing extracted data +10. **ALWAYS** call cleaner functions when processing extracted data; +11. **ALWAYS** be sure to set a unique `_key` field for a db entry. You can use the `add_key()` helper function to create a unique key from a combination of other entry fields. 
**Field Types Mapping:** - Prices → `Mapped[int]` with `clean_price()` static method @@ -161,11 +152,10 @@ def clean_price(desc: str) -> int: **Data Storage Pattern:** ```python -data = { - 'field1': self.clean_field1(raw_value1), # Use cleaner if exists - 'field2': raw_value2 # Or use raw value -} -obj = self.load_from_webcap(static=data) +obj = ObjType( + field1=self.clean_field1(raw_value1), # Use cleaner if exists + field2=raw_value2 # Or use raw value +) webcap.store(obj, db) ``` @@ -180,10 +170,16 @@ These are common patterns you may encounter. **Use your judgment** to adapt or c for page_num in range(npages): # Extract and store data - price = webcap.fetch("#X1", "price of each property on the page") - data = {'price': self.clean_price(price)} - obj = self.load_from_webcap(static=data) - webcap.store(obj, db) + prices = webcap.fetch("#X1", "price of each listing on the page") + prices = prices if isinstance(prices, list) else [prices] + + # Create price objects to store in database + data = [ + Price(price=self.clean_price(price)) + for price in prices + ] + webcap.store(data, db) + next_button = fetch("#X2","next button") webcap.click(next_button) ``` @@ -194,19 +190,25 @@ for page_num in range(npages): webcap.scroll_down(max_scrolls=npages * 5) # Extract all loaded data -price = webcap.fetch("#X1", "all prices visible on the page") -data = {'price': self.clean_price(price)} -obj = self.load_from_webcap(static=data) -webcap.store(obj, db) +prices = webcap.fetch("#X1", "all prices visible on the page") +prices = prices if isinstance(prices, list) else [prices] + +# Store extracted data +data = [ + Price(price=self.clean_price(price)) + for price in prices +] +webcap.store(data, db) ``` **Single page with no navigation:** ```python # Extract data directly -price = webcap.fetch("#X1", "price of the property") -data = {'price': self.clean_price(price)} -obj = self.load_from_webcap(static=data) -webcap.store(obj, db) +price = webcap.fetch("#X1", "price of the 
listing") +listing_price = ListingPrice( + price=self.clean_price(price) +) +webcap.store(listing_price, db) ``` **Navigating to detail pages:** @@ -220,9 +222,11 @@ for page_num in range(npages): price = webcap.fetch("#X1", "price") description = webcap.fetch("#X2", "full description") - data = {'price': self.clean_price(price), 'description': description} - obj = self.load_from_webcap(static=data) - webcap.store(obj, db) + listing_details = Listing_Details( + price=self.clean_price(price), + description=description + ) + webcap.store(listing_details, db) # Go back to listing page webcap.visit(url) @@ -244,7 +248,6 @@ if not isinstance(pdf_links, list): pdf_links = [pdf_links] downloaded_paths = [] try: for pdf_url in pdf_links: - print if not pdf_url: continue downloaded_paths.append(webcap.download(pdf_url)) @@ -284,6 +287,7 @@ from webcap.lib.utils import handle_sigint from webcap.lib.synthesis import configure_options, execute from sqlalchemy import Integer, String from sqlalchemy.orm import Mapped, mapped_column +from webcap.db.data.utils import add_key logger = logging.getLogger(__name__) @@ -306,7 +310,7 @@ class LisbonPropertyData: # -------------------------- class CustomScraper(Scraper): table = LisbonPropertyData - index = ('price',) + index = ('price','bedrooms') def __init__(self, url, settings: Settings): self.url = url @@ -343,18 +347,23 @@ class CustomScraper(Scraper): # Iterate through pages for page_num in range(npages): - - # Extract data from current page - price = webcap.fetch("#X1", "price of each property on the page") - bedrooms = webcap.fetch("#X2", "number of bedrooms for each property") + subtrees = webcap.fetch("#X1", "listing subtrees with property details") + data = [] + + for tree in subtrees: + # Extract data from current page + price = webcap.fetch("#X2", "price of each property on the page", tree) + bedrooms = webcap.fetch("#X3", "number of bedrooms for each property", tree) + + # Process and store + obj = LisbonPropertyData( + 
price=self.clean_price(price), + bedrooms=bedrooms, + ) + add_key(obj, self.index) + data.append(obj) - # Process and store - data = { - 'price': self.clean_price(price), - 'bedrooms': bedrooms - } - obj = self.load_from_webcap(static=data) - webcap.store(obj, db) + webcap.store(data, db) next_button = fetch("#X3","next button") @@ -417,7 +426,7 @@ from webcap.lib.synthesis import configure_options, execute from sqlalchemy import Integer, String from sqlalchemy.orm import Mapped, mapped_column from webcap.lib.docs import DocSearchResult -from webcap.db.data.utils import sha256 +from webcap.db.data.utils import add_key logger = logging.getLogger(__name__) @@ -435,14 +444,13 @@ class CommissionsData: text_snippet: Mapped[str] = mapped_column(String) file_path: Mapped[str] = mapped_column(String) source_url: Mapped[str] = mapped_column(String) - timestamp_utc: Mapped[str | None] = mapped_column(String, nullable=True) # -------------------------- # Scraper Class # -------------------------- class CustomScraper(Scraper): table = CommissionsData - index = ('text_snippet',) + index = ('text_snippet', 'file_path', 'source_url') def __init__(self, url, settings: Settings): self.url = url @@ -467,17 +475,14 @@ class CustomScraper(Scraper): pattern = result.regex occurrences = len(result.data) - key = sha256(f"(unknown)||{pattern}||{occurrences}") - for snippet in result.data: - output.append( - CommissionsData( - file_path=filename, - _key=key, - text_snippet=snippet.group(1), - source_url=self.url - ) + obj = CommissionsData( + text_snippet=snippet.group(1), + file_path=filename, + source_url=self.url ) + add_key(obj, self.index) + output.append(obj) return output @@ -507,14 +512,16 @@ class CustomScraper(Scraper): downloaded_paths = [] try: for pdf_url in pdf_links: - print + if not pdf_url: continue + downloaded_paths.append(webcap.download(pdf_url)) + except Exception as e: logger.debug(f"PDF links fetch failed on landing page: {e}") - docs = webcap.searchdocs("#R2", 
None, downloaded_paths) + docs = webcap.searchdocs("#R2", "Bank account commission value", downloaded_paths) if not isinstance(docs, list): docs = [docs]