formalsec · frediramos · Apr 14, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 13, 2026
diff --git a/src/webcap/lib/synthesis/scraper/prompts.py b/src/webcap/lib/synthesis/scraper/prompts.py
@@ -1,19 +1,20 @@
-import re
-import json
-
 from enum import Enum
 from typing import Dict
-from pathlib import Path
 
 from webcap.utils.files import read_file, current_dir
 
-GENERATION: Path = current_dir(__file__) / 'templates' / 'generation.md'
-CORRECTION: Path = current_dir(__file__) / 'templates' / 'correction.md'
-LINTING: Path = current_dir(__file__) / 'templates' / 'linting.md'
+THIS_DIR = current_dir(__file__)
+
+API = THIS_DIR / 'templates' / 'api.md'
+GENERATION = THIS_DIR / 'templates' / 'generation.md'
+CORRECTION = THIS_DIR / 'templates' / 'correction.md'
+LINTING = THIS_DIR / 'templates' / 'linting.md'
 
 
 class PromptVariable(Enum):
 
+    api = "WEBCAP_API"
+
     url = "TARGET_URL"
     correction = "CORRECTION"
     desc = "SCRAPER_DESCRIPTION"
@@ -54,8 +55,9 @@ def generation_prompt(description: str, url: str) -> str:
     if not isinstance(url, str):
         raise ValueError(f"\'url\' should be a string")
 
+    api = read_file(API)
     template = read_file(GENERATION)
-    return fill(template, url=url, desc=description)
+    return fill(template, api=api, url=url, desc=description)
 
 
 def correction_prompt(correction: str) -> str:

diff --git a/src/webcap/lib/synthesis/scraper/templates/api.md b/src/webcap/lib/synthesis/scraper/templates/api.md
@@ -0,0 +1,72 @@
+#### WebCap API Description
+
+```py
+def visit(self, url: str) -> str : ...
+```
+- To visit a page, returns the visited url.
+
+```py
+def click(self, element: ElementHandle, navigate: bool = True) -> Page : ...
+```
+- To click on a link/button, returns the current `Playwright` page (Do not interact with this return value directly).
+
+
+```py 
+def fetch(
+    self,
+    id: str = None,
+    query_or_desc: str = None,
+    tree=None
+) -> str | ElementHandle | list[str | ElementHandle] : ...
+``` 
+- To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description, but know that it is converted to XPath internally).
+
+
+```py
+def scroll_down(
+    self,
+    step: int = 800,
+    max_scrolls: int = 25,
+    wait_ms: int = 250,
+    stabilize_rounds: int = 3
+) -> Page : ...
+```
+- To scroll down and load dynamic content, returns the current `Playwright` page (Do not interact with this return value directly).
+
+
+```py
+def download(self, target: str | ElementHandle, name: str = None) -> Path: ...
+```
+- To download a file (ex: `.pdf`) specified as an absolute url or a clickable element, returns the path to the downloaded file.
+
+
+```py
+def searchdocs(
+    id,
+    description,
+    docs: str | Path | List[str | Path] = None,
+    ignorecase: bool = False
+) -> list[DocSearchResults] : ...
+```
+- To search for regex matches in a downloaded file, returns a list of `DocSearchResult` objects (use a textual description, but know that it is converted to a regex internally).
+
+
+```py
+def store(self, data: Union[Any, List[Any]], db: Database = None) -> None: ...
+```
+- To save a record to the database, `data` can be a single element or a list, returns `None`.
+
+#### Helper Objects / Modules
+
+- `Playwright`: Python package `playwright` is a Python library to automate Chromium, Firefox and WebKit with a single API.
+
+- `ElementHandle`: a Playwright object that represents an in-page DOM element.
+
+- `DocSearchResult`: the dataclass shown below
+```py
+@dataclass
+class DocSearchResult:
+    name: Path
+    data: List[re.Match[str]]
+    regex: re.Pattern
+``` 
diff --git a/src/webcap/lib/synthesis/scraper/templates/generation.md b/src/webcap/lib/synthesis/scraper/templates/generation.md
@@ -5,18 +5,8 @@ You are a Python code generator specialized in creating web scrapers that follow
 
 ## MANDATORY REQUIREMENTS
 
-### 1. API Primitives (ONLY THESE LIB CALLS ARE ALLOWED)
-You must use ONLY these primitives to interact with web pages:
-- `webcap.visit(url | element)` - To visit a page, returns None;
-- `webcap.click(description)` - To click on a link/button, returns None (use textual description);
-- `webcap.fetch(#id, description)` - To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description only, but if it helps know that it uses xpath internally);
-- `webcap.scroll_down(max_scrolls=N)` - To scroll down and load dynamic content, returns None (optional: step, wait_ms, stabilize_rounds);
-- `webcap.download(url | element)` - To download a file (ex: pdf) specified as an absolute url or a clickable element, returns the path to the downloaded file;
-- `webcap.searchdocs(regex_query, docs | [docs])` - To search for regex matches in a downloaded file, returns [DocSearchResults] (leave as an empty string, but if it helps know that it uses regex internally);
-- `webcap.store(obj, db)` - To save a record to the database, obj can be a single element or a list.
-
-**Note:** Use `scroll_down()` when you need to load content that appears as you scroll (infinite scroll, lazy loading, etc.)
-**Note:** Use `download()` and `searchdocs()` when the desired content is inside a file
+### 1. API Primitives (ONLY WEBCAPLIB CALLS ARE ALLOWED)
+`<<WEBCAP_API>>`
 
 ### 2. Code Structure Template
 Follow this EXACT structure:
@@ -115,16 +105,17 @@ if __name__ == '__main__':
 ### 3. Generation Rules
 
 **CRITICAL RULES:**
-1. **DO NOT** use xpath in fetch() calls - use natural language descriptions;
+1. **DO NOT** use xpath in `fetch()` calls - use natural language descriptions;
 2. **DO NOT** add any selenium, beautifulsoup, or other scraping libraries;
-3. **DO NOT** modify the main() structure;
+3. **DO NOT** modify the `main()` structure;
 4. **DO NOT** add extra imports beyond the template;
-5. **DO NOT** extract and/or set the _id, _key fields. Those are handled internally;
+5. **DO NOT** extract and/or set the `_id` field. This is handled internally;
 6. **DO NOT** name any new database field starting with '_';
-7. **ALWAYS** use only the primitives: visit(), click(), fetch(), scroll_down(), download(), searchdocs(), store();
+7. **ALWAYS** use only the primitives: `visit()`, `click()`, `fetch()`, `scroll_down()`, `download()`, `searchdocs()`, `store()`;
 8. **ALWAYS** be sure to add unique identifier as the first argument of each fetch.
 9. **ALWAYS** include cleaner functions as @staticmethod but only for numeric or very complex fields;
-10. **ALWAYS** call cleaner functions when processing extracted data
+10. **ALWAYS** call cleaner functions when processing extracted data;
+11. **ALWAYS** be sure to set a unique `_key` field for a db entry. You can use the `add_key()` helper function to create a unique key from a combination of other entry fields.
 
 **Field Types Mapping:**
 - Prices → `Mapped[int]` with `clean_price()` static method
@@ -161,11 +152,10 @@ def clean_price(desc: str) -> int:
 
 **Data Storage Pattern:**
 ```python
-data = {
-    'field1': self.clean_field1(raw_value1),  # Use cleaner if exists
-    'field2': raw_value2                      # Or use raw value
-}
-obj = self.load_from_webcap(static=data)
+obj = ObjType(
+    field1=self.clean_field1(raw_value1), # Use cleaner if exists
+    field2=raw_value2                     # Or use raw value
+)
 webcap.store(obj, db)
 ```
 
@@ -180,10 +170,16 @@ These are common patterns you may encounter. **Use your judgment** to adapt or c
 for page_num in range(npages):
 
     # Extract and store data
-    price = webcap.fetch("#X1", "price of each property on the page")
-    data = {'price': self.clean_price(price)}
-    obj = self.load_from_webcap(static=data)
-    webcap.store(obj, db)
+    prices = webcap.fetch("#X1", "price of each listing on the page")
+    prices = prices if isinstance(prices, list) else [prices]
+
+    # Create price objects to store in database
+    data = [
+        Price(price=self.clean_price(price)) 
+            for price in prices
+    ]
+    webcap.store(data, db)
+
     next_button = fetch("#X2","next button")
     webcap.click(next_button)
 ```
@@ -194,19 +190,25 @@ for page_num in range(npages):
 webcap.scroll_down(max_scrolls=npages * 5)
 
 # Extract all loaded data
-price = webcap.fetch("#X1", "all prices visible on the page")
-data = {'price': self.clean_price(price)}
-obj = self.load_from_webcap(static=data)
-webcap.store(obj, db)
+prices = webcap.fetch("#X1", "all prices visible on the page")
+prices = prices if isinstance(prices, list) else [prices]
+
+# Store extracted data
+data = [
+    Price(price=self.clean_price(price)) 
+        for price in prices
+]
+webcap.store(data, db)
 ```
 
 **Single page with no navigation:**
 ```python
 # Extract data directly
-price = webcap.fetch("#X1", "price of the property")
-data = {'price': self.clean_price(price)}
-obj = self.load_from_webcap(static=data)
-webcap.store(obj, db)
+price = webcap.fetch("#X1", "price of the listing")
+listing_price = ListingPrice(
+    price=self.clean_price(price)
+)
+webcap.store(listing_price, db)
 ```
 
 **Navigating to detail pages:**
@@ -220,9 +222,11 @@ for page_num in range(npages):
     price = webcap.fetch("#X1", "price")
     description = webcap.fetch("#X2", "full description")
 
-    data = {'price': self.clean_price(price), 'description': description}
-    obj = self.load_from_webcap(static=data)
-    webcap.store(obj, db)
+    listing_details = ListingDetails(
+        price=self.clean_price(price),
+        description=description
+    )
+    webcap.store(listing_details, db)
 
     # Go back to listing page
     webcap.visit(url)
@@ -244,7 +248,6 @@ if not isinstance(pdf_links, list): pdf_links = [pdf_links]
 downloaded_paths = []
 try:
     for pdf_url in pdf_links:
-        print
         if not pdf_url:
             continue
         downloaded_paths.append(webcap.download(pdf_url))
@@ -284,6 +287,7 @@ from webcap.lib.utils import handle_sigint
 from webcap.lib.synthesis import configure_options, execute
 from sqlalchemy import Integer, String
 from sqlalchemy.orm import Mapped, mapped_column
+from webcap.db.data.utils import add_key
 
 logger = logging.getLogger(__name__)
 
@@ -306,7 +310,7 @@ class LisbonPropertyData:
 # --------------------------
 class CustomScraper(Scraper):
     table = LisbonPropertyData
-    index = ('price',)
+    index = ('price','bedrooms')
 
     def __init__(self, url, settings: Settings):
         self.url = url
@@ -343,18 +347,23 @@ class CustomScraper(Scraper):
 
             # Iterate through pages
             for page_num in range(npages):
-
-                # Extract data from current page
-                price = webcap.fetch("#X1", "price of each property on the page")
-                bedrooms = webcap.fetch("#X2", "number of bedrooms for each property")
+                subtrees = webcap.fetch("#X1", "listing subtrees with property details")
+                data = []
+
+                for tree in subtrees:
+                    # Extract data from current page
+                    price = webcap.fetch("#X2", "price of each property on the page", tree)
+                    bedrooms = webcap.fetch("#X3", "number of bedrooms for each property", tree)
+
+                    # Process and store
+                    obj = LisbonPropertyData(
+                            price=self.clean_price(price),
+                            bedrooms=bedrooms,
+                    )
+                    add_key(obj, self.index)
+                    data.append(obj)
 
-                # Process and store
-                data = {
-                    'price': self.clean_price(price),
-                    'bedrooms': bedrooms
-                }
-                obj = self.load_from_webcap(static=data)
-                webcap.store(obj, db)
+                webcap.store(data, db)
 
                 next_button = fetch("#X3","next button")
 
@@ -417,7 +426,7 @@ from webcap.lib.synthesis import configure_options, execute
 from sqlalchemy import Integer, String
 from sqlalchemy.orm import Mapped, mapped_column
 from webcap.lib.docs import DocSearchResult
-from webcap.db.data.utils import sha256
+from webcap.db.data.utils import add_key
 
 logger = logging.getLogger(__name__)
 
@@ -435,14 +444,13 @@ class CommissionsData:
     text_snippet: Mapped[str] = mapped_column(String)
     file_path: Mapped[str] = mapped_column(String)
     source_url: Mapped[str] = mapped_column(String)
-    timestamp_utc: Mapped[str | None] = mapped_column(String, nullable=True)
 
 # --------------------------
 # Scraper Class
 # --------------------------
 class CustomScraper(Scraper):
     table = CommissionsData
-    index = ('text_snippet',)
+    index = ('text_snippet', 'file_path', 'source_url')
 
     def __init__(self, url, settings: Settings):
         self.url = url
@@ -467,17 +475,14 @@ class CustomScraper(Scraper):
             pattern = result.regex
             occurrences = len(result.data)
 
-            key = sha256(f"{filename}||{pattern}||{occurrences}")
-
             for snippet in result.data:
-                output.append(
-                    CommissionsData(
-                        file_path=filename,
-                        _key=key,
-                        text_snippet=snippet.group(1),
-                        source_url=self.url
-                    )
+                obj = CommissionsData(
+                    text_snippet=snippet.group(1),
+                    file_path=filename,
+                    source_url=self.url
                 )
+                add_key(obj, self.index)
+                output.append(obj)
 
         return output
 
@@ -507,14 +512,16 @@ class CustomScraper(Scraper):
             downloaded_paths = []
             try:
                 for pdf_url in pdf_links:
-                    print
+
                     if not pdf_url:
                         continue
+
                     downloaded_paths.append(webcap.download(pdf_url))
+
             except Exception as e:
                 logger.debug(f"PDF links fetch failed on landing page: {e}")
 
-            docs = webcap.searchdocs("#R2", None, downloaded_paths)
+            docs = webcap.searchdocs("#R2", "Bank account commission value", downloaded_paths)
 
             if not isinstance(docs, list):
                 docs = [docs]