Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions src/webcap/lib/synthesis/scraper/prompts.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import re
import json

from enum import Enum
from typing import Dict
from pathlib import Path

from webcap.utils.files import read_file, current_dir

GENERATION: Path = current_dir(__file__) / 'templates' / 'generation.md'
CORRECTION: Path = current_dir(__file__) / 'templates' / 'correction.md'
LINTING: Path = current_dir(__file__) / 'templates' / 'linting.md'
THIS_DIR = current_dir(__file__)

API = THIS_DIR / 'templates' / 'api.md'
GENERATION = THIS_DIR / 'templates' / 'generation.md'
CORRECTION = THIS_DIR / 'templates' / 'correction.md'
LINTING = THIS_DIR / 'templates' / 'linting.md'


class PromptVariable(Enum):

api = "WEBCAP_API"

url = "TARGET_URL"
correction = "CORRECTION"
desc = "SCRAPER_DESCRIPTION"
Expand Down Expand Up @@ -54,8 +55,9 @@ def generation_prompt(description: str, url: str) -> str:
if not isinstance(url, str):
raise ValueError(f"\'url\' should be a string")

api = read_file(API)
template = read_file(GENERATION)
return fill(template, url=url, desc=description)
return fill(template, api=api, url=url, desc=description)


def correction_prompt(correction: str) -> str:
Expand Down
72 changes: 72 additions & 0 deletions src/webcap/lib/synthesis/scraper/templates/api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#### WebCap API Description

```py
def visit(self, url: str) -> str : ...
```
- To visit a page, returns the visited url.

```py
def click(self, element: ElementHandle, navigate: bool = True) -> Page : ...
```
- To click on a link/button, returns the current `Playwright` page (Do not interact with this return value directly).


```py
def fetch(
self,
id: str = None,
query_or_desc: str = None,
tree=None
) -> str | ElementHandle | list[str | ElementHandle] : ...
```
- To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description, but know that it is converted to XPath internally).


```py
def scroll_down(
self,
step: int = 800,
max_scrolls: int = 25,
wait_ms: int = 250,
stabilize_rounds: int = 3
) -> Page : ...
```
- To scroll down and load dynamic content, returns the current `Playwright` page (Do not interact with this return value directly).


```py
def download(self, target: str | ElementHandle, name: str = None) -> Path: ...
```
- To download a file (e.g. `.pdf`) specified as an absolute URL or a clickable element; returns the path to the downloaded file.


```py
def searchdocs(
id,
description,
docs: str | Path | List[str | Path] = None,
ignorecase: bool = False
) -> list[DocSearchResults] : ...
```
- To search for regex matches in a downloaded file, returns a list of `DocSearchResults` (use textual description, but know that it is converted to a regex internally).


```py
def store(self, data: Union[Any, List[Any]], db: Database = None) -> None: ...
```
- To save a record to the database, `data` can be a single element or a list, returns `None`.

#### Helper Objects / Modules

- `Playwright`: The `playwright` Python package, a library to automate Chromium, Firefox and WebKit with a single API.

- `ElementHandle`: Playwright object that represents an in-page DOM element.

- `DocSearchResult` (referred to as `DocSearchResults` in the `searchdocs` description): The dataclass implemented below
```py
@dataclass
class DocSearchResult:
name: Path
data: List[re.Match[str]]
regex: re.Pattern
```
135 changes: 71 additions & 64 deletions src/webcap/lib/synthesis/scraper/templates/generation.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,8 @@ You are a Python code generator specialized in creating web scrapers that follow

## MANDATORY REQUIREMENTS

### 1. API Primitives (ONLY THESE LIB CALLS ARE ALLOWED)
You must use ONLY these primitives to interact with web pages:
- `webcap.visit(url | element)` - To visit a page, returns None;
- `webcap.click(description)` - To click on a link/button, returns None (use textual description);
- `webcap.fetch(#id, description)` - To extract data, returns a list of values when multiple are extracted or a single value otherwise (use textual description only, but if it helps know that it uses xpath internally);
- `webcap.scroll_down(max_scrolls=N)` - To scroll down and load dynamic content, returns None (optional: step, wait_ms, stabilize_rounds);
- `webcap.download(url | element)` - To download a file (ex: pdf) specified as an absolute url or a clickable element, returns the path to the downloaded file;
- `webcap.searchdocs(regex_query, docs | [docs])` - To search for regex matches in a downloaded file, returns [DocSearchResults] (leave as an empty string, but if it helps know that it uses regex internally);
- `webcap.store(obj, db)` - To save a record to the database, obj can be a single element or a list.

**Note:** Use `scroll_down()` when you need to load content that appears as you scroll (infinite scroll, lazy loading, etc.)
**Note:** Use `download()` and `searchdocs()` when the desired content is inside a file
### 1. API Primitives (ONLY WEBCAPLIB CALLS ARE ALLOWED)
`<<WEBCAP_API>>`

### 2. Code Structure Template
Follow this EXACT structure:
Expand Down Expand Up @@ -115,16 +105,17 @@ if __name__ == '__main__':
### 3. Generation Rules

**CRITICAL RULES:**
1. **DO NOT** use xpath in fetch() calls - use natural language descriptions;
1. **DO NOT** use xpath in `fetch()` calls - use natural language descriptions;
2. **DO NOT** add any selenium, beautifulsoup, or other scraping libraries;
3. **DO NOT** modify the main() structure;
3. **DO NOT** modify the `main()` structure;
4. **DO NOT** add extra imports beyond the template;
5. **DO NOT** extract and/or set the _id, _key fields. Those are handled internally;
5. **DO NOT** extract and/or set the `_id` field. This is handled internally;
6. **DO NOT** name any new database field starting with '_';
7. **ALWAYS** use only the primitives: visit(), click(), fetch(), scroll_down(), download(), searchdocs(), store();
7. **ALWAYS** use only the primitives: `visit()`, `click()`, `fetch()`, `scroll_down()`, `download()`, `searchdocs()`, `store()`;
8. **ALWAYS** be sure to add a unique identifier as the first argument of each `fetch()` call;
9. **ALWAYS** include cleaner functions as @staticmethod but only for numeric or very complex fields;
10. **ALWAYS** call cleaner functions when processing extracted data
10. **ALWAYS** call cleaner functions when processing extracted data;
11. **ALWAYS** be sure to set a unique `_key` field for a db entry. You can use the `add_key()` helper function to create a unique key from a combination of other entry fields.

**Field Types Mapping:**
- Prices → `Mapped[int]` with `clean_price()` static method
Expand Down Expand Up @@ -161,11 +152,10 @@ def clean_price(desc: str) -> int:

**Data Storage Pattern:**
```python
data = {
'field1': self.clean_field1(raw_value1), # Use cleaner if exists
'field2': raw_value2 # Or use raw value
}
obj = self.load_from_webcap(static=data)
obj = ObjType(
field1=self.clean_field1(raw_value1), # Use cleaner if exists
field2=raw_value2 # Or use raw value
)
webcap.store(obj, db)
```

Expand All @@ -180,10 +170,16 @@ These are common patterns you may encounter. **Use your judgment** to adapt or c
for page_num in range(npages):

# Extract and store data
price = webcap.fetch("#X1", "price of each property on the page")
data = {'price': self.clean_price(price)}
obj = self.load_from_webcap(static=data)
webcap.store(obj, db)
prices = webcap.fetch("#X1", "price of each listing on the page")
prices = prices if isinstance(prices, list) else [prices]

# Create price objects to store in database
data = [
Price(price=self.clean_price(price))
for price in prices
]
webcap.store(data, db)

next_button = fetch("#X2","next button")
webcap.click(next_button)
```
Expand All @@ -194,19 +190,25 @@ for page_num in range(npages):
webcap.scroll_down(max_scrolls=npages * 5)

# Extract all loaded data
price = webcap.fetch("#X1", "all prices visible on the page")
data = {'price': self.clean_price(price)}
obj = self.load_from_webcap(static=data)
webcap.store(obj, db)
prices = webcap.fetch("#X1", "all prices visible on the page")
prices = prices if isinstance(prices, list) else [prices]

# Store extracted data
data = [
Price(price=self.clean_price(price))
for price in prices
]
webcap.store(data, db)
```

**Single page with no navigation:**
```python
# Extract data directly
price = webcap.fetch("#X1", "price of the property")
data = {'price': self.clean_price(price)}
obj = self.load_from_webcap(static=data)
webcap.store(obj, db)
price = webcap.fetch("#X1", "price of the listing")
listing_price = ListingPrice(
price=self.clean_price(price)
)
webcap.store(listing_price, db)
```

**Navigating to detail pages:**
Expand All @@ -220,9 +222,11 @@ for page_num in range(npages):
price = webcap.fetch("#X1", "price")
description = webcap.fetch("#X2", "full description")

data = {'price': self.clean_price(price), 'description': description}
obj = self.load_from_webcap(static=data)
webcap.store(obj, db)
listing_details = Listing_Details(
price=self.clean_price(price),
description=description
)
webcap.store(listing_details, db)

# Go back to listing page
webcap.visit(url)
Expand All @@ -244,7 +248,6 @@ if not isinstance(pdf_links, list): pdf_links = [pdf_links]
downloaded_paths = []
try:
for pdf_url in pdf_links:
print
if not pdf_url:
continue
downloaded_paths.append(webcap.download(pdf_url))
Expand Down Expand Up @@ -284,6 +287,7 @@ from webcap.lib.utils import handle_sigint
from webcap.lib.synthesis import configure_options, execute
from sqlalchemy import Integer, String
from sqlalchemy.orm import Mapped, mapped_column
from webcap.db.data.utils import add_key

logger = logging.getLogger(__name__)

Expand All @@ -306,7 +310,7 @@ class LisbonPropertyData:
# --------------------------
class CustomScraper(Scraper):
table = LisbonPropertyData
index = ('price',)
index = ('price','bedrooms')

def __init__(self, url, settings: Settings):
self.url = url
Expand Down Expand Up @@ -343,18 +347,23 @@ class CustomScraper(Scraper):

# Iterate through pages
for page_num in range(npages):

# Extract data from current page
price = webcap.fetch("#X1", "price of each property on the page")
bedrooms = webcap.fetch("#X2", "number of bedrooms for each property")
subtrees = webcap.fetch("#X1", "listing subtrees with property details")
data = []

for tree in subtrees:
# Extract data from current page
price = webcap.fetch("#X2", "price of each property on the page", tree)
bedrooms = webcap.fetch("#X3", "number of bedrooms for each property", tree)

# Process and store
obj = LisbonPropertyData(
price=self.clean_price(price),
bedrooms=bedrooms,
)
add_key(obj, self.index)
data.append(obj)

# Process and store
data = {
'price': self.clean_price(price),
'bedrooms': bedrooms
}
obj = self.load_from_webcap(static=data)
webcap.store(obj, db)
webcap.store(data, db)

next_button = fetch("#X3","next button")

Expand Down Expand Up @@ -417,7 +426,7 @@ from webcap.lib.synthesis import configure_options, execute
from sqlalchemy import Integer, String
from sqlalchemy.orm import Mapped, mapped_column
from webcap.lib.docs import DocSearchResult
from webcap.db.data.utils import sha256
from webcap.db.data.utils import add_key

logger = logging.getLogger(__name__)

Expand All @@ -435,14 +444,13 @@ class CommissionsData:
text_snippet: Mapped[str] = mapped_column(String)
file_path: Mapped[str] = mapped_column(String)
source_url: Mapped[str] = mapped_column(String)
timestamp_utc: Mapped[str | None] = mapped_column(String, nullable=True)

# --------------------------
# Scraper Class
# --------------------------
class CustomScraper(Scraper):
table = CommissionsData
index = ('text_snippet',)
index = ('text_snippet', 'file_path', 'source_urls')

def __init__(self, url, settings: Settings):
self.url = url
Expand All @@ -467,17 +475,14 @@ class CustomScraper(Scraper):
pattern = result.regex
occurrences = len(result.data)

key = sha256(f"{filename}||{pattern}||{occurrences}")

for snippet in result.data:
output.append(
CommissionsData(
file_path=filename,
_key=key,
text_snippet=snippet.group(1),
source_url=self.url
)
obj = CommissionsData(
text_snippet=snippet.group(1),
file_path=filename,
source_url=self.url
)
add_key(obj)
output.append(obj, self.index)

return output

Expand Down Expand Up @@ -507,14 +512,16 @@ class CustomScraper(Scraper):
downloaded_paths = []
try:
for pdf_url in pdf_links:
print

if not pdf_url:
continue

downloaded_paths.append(webcap.download(pdf_url))

except Exception as e:
logger.debug(f"PDF links fetch failed on landing page: {e}")

docs = webcap.searchdocs("#R2", None, downloaded_paths)
docs = webcap.searchdocs("#R2", "Bank account commission value", downloaded_paths)

if not isinstance(docs, list):
docs = [docs]
Expand Down
Loading