Skip to content

Commit 9bf283b

Browse files
committed
Added materials for Docling vs LlamaParse tutorial
1 parent a568540 commit 9bf283b

6 files changed

Lines changed: 154 additions & 0 deletions

File tree

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Parse a PDF with Docling and print Markdown output."""
2+
3+
from pathlib import Path
4+
5+
from docling.document_converter import DocumentConverter
6+
7+
PDF_PATH = Path("sample_report.pdf")
8+
9+
10+
def main() -> None:
    """Convert ``PDF_PATH`` with Docling and print a Markdown preview.

    Prints the first 3000 characters of the Markdown export, a separator,
    and then the number of pages and tables on the converted document.
    """
    document = DocumentConverter().convert(PDF_PATH).document

    # Only a preview is printed; the full export can be very long.
    preview = document.export_to_markdown()[:3000]
    print(preview)
    print("\n---\n")

    page_count = len(document.pages)
    table_count = len(document.tables)
    print(f"Pages parsed: {page_count}")
    print(f"Tables found: {table_count}")
19+
20+
21+
if __name__ == "__main__":
22+
main()
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Export Docling parse results to Markdown, JSON, HTML, and pandas DataFrames."""
2+
3+
import json
4+
from pathlib import Path
5+
6+
from docling.document_converter import DocumentConverter
7+
8+
PDF_PATH = Path("sample_report.pdf")
9+
10+
11+
def main() -> None:
    """Convert ``PDF_PATH`` with Docling and write each export format to disk.

    Writes ``output_docling.md``, ``output_docling.json`` and
    ``output_docling.html`` in the current directory (UTF-8), then prints the
    shape and head of a DataFrame built from every table in the document.
    """
    conversion = DocumentConverter().convert(PDF_PATH)
    document = conversion.document

    # Markdown export.
    md_target = Path("output_docling.md")
    md_target.write_text(document.export_to_markdown(), encoding="utf-8")

    # JSON export: the dict form is serialised with a two-space indent.
    json_text = json.dumps(document.export_to_dict(), indent=2)
    json_target = Path("output_docling.json")
    json_target.write_text(json_text, encoding="utf-8")

    # HTML export.
    html_target = Path("output_docling.html")
    html_target.write_text(document.export_to_html(), encoding="utf-8")

    # One DataFrame per table; the document is passed to each export call.
    for position, table_item in enumerate(document.tables):
        table_frame = table_item.export_to_dataframe(doc=document)
        print(f"Table {position} shape: {table_frame.shape}")
        print(table_frame.head(), end="\n\n")
31+
32+
33+
if __name__ == "__main__":
34+
main()
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Parse a PDF with LlamaParse (llama-cloud SDK) and print Markdown output."""
2+
3+
import os
4+
from pathlib import Path
5+
6+
from llama_cloud import LlamaCloud
7+
8+
PDF_PATH = Path("sample_report.pdf")
9+
10+
11+
def main() -> None:
    """Upload ``PDF_PATH`` to LlamaCloud, parse it, and print a Markdown preview.

    Reads the API key from the ``LLAMA_CLOUD_API_KEY`` environment variable,
    uploads the PDF, requests a parse with the Markdown expansion, and prints
    the first 3000 characters of the per-page Markdown followed by the page
    count.

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    uploaded = client.files.create(file=PDF_PATH, purpose="parse")
    result = client.parsing.parse(
        file_id=uploaded.id,
        tier="agentic",
        version="latest",
        expand=["markdown"],
    )

    parsed_pages = result.markdown.pages

    # Every page is followed by a "---" separator line. A single join builds
    # the text in one pass instead of growing a string with += per page.
    combined = "".join(page.markdown + "\n---\n" for page in parsed_pages)

    print(combined[:3000])
    print(f"Pages parsed: {len(parsed_pages)}")
29+
30+
if __name__ == "__main__":
31+
main()
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Export LlamaParse results to Markdown, Text, and schema-driven JSON."""
2+
3+
import json
4+
import os
5+
from pathlib import Path
6+
7+
from llama_cloud import LlamaCloud
8+
from pydantic import BaseModel, Field
9+
10+
PDF_PATH = Path("sample_report.pdf")
11+
12+
13+
class RevenueRow(BaseModel):
    # One row of the revenue table: a quarter label, its revenue, and an
    # optional growth figure.
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose; pydantic is expected to copy a docstring into the JSON schema
    # that main() builds via model_json_schema(); confirm before adding one.
    quarter: str = Field(
        description="Fiscal quarter label, e.g. Q1 2024",
    )
    revenue_millions: float = Field(
        description="Revenue in millions of USD",
    )
    # Optional field: falls back to None when no value is supplied.
    growth_percent: float | None = Field(
        description="Year-over-year growth percentage if stated",
        default=None,
    )
20+
21+
22+
class RevenueTable(BaseModel):
    # Top-level extraction schema: a list of RevenueRow entries.
    # Comment-only documentation so the generated JSON schema is unchanged.
    rows: list[RevenueRow] = Field(
        description="One row per quarter in the table",
    )
24+
25+
26+
def main() -> None:
    """Parse ``PDF_PATH`` with LlamaCloud and save Markdown, text, and JSON.

    Steps, in order:
      1. Upload the PDF for parsing and request Markdown and text expansions.
      2. Write the page Markdown to ``output_llamaparse.md``.
      3. If text pages were returned, write them to ``output_llamaparse.text``.
      4. Upload the PDF again for extraction, run an extraction against the
         ``RevenueTable`` JSON schema, then write the result to
         ``output_llamaparse.json`` and print it.

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    api_key = os.environ["LLAMA_CLOUD_API_KEY"]
    client = LlamaCloud(api_key=api_key)

    # --- Parse: Markdown and plain text -----------------------------------
    parse_upload = client.files.create(file=PDF_PATH, purpose="parse")
    parsed = client.parsing.parse(
        file_id=parse_upload.id,
        tier="agentic",
        version="latest",
        expand=["markdown", "text"],
    )

    markdown_chunks = [page.markdown for page in parsed.markdown.pages]
    Path("output_llamaparse.md").write_text(
        "\n\n".join(markdown_chunks),
        encoding="utf-8",
    )

    # The text expansion is only written when it is present and non-empty.
    if parsed.text and parsed.text.pages:
        text_chunks = [page.text for page in parsed.text.pages]
        Path("output_llamaparse.text").write_text(
            "\n".join(text_chunks),
            encoding="utf-8",
        )

    # --- Extract: schema-driven JSON --------------------------------------
    extract_upload = client.files.create(file=PDF_PATH, purpose="extract")
    extract_config = {
        "data_schema": RevenueTable.model_json_schema(),
        "extraction_target": "per_doc",
        "tier": "agentic",
    }
    job = client.extract.run(
        file_input=extract_upload.id,
        configuration=extract_config,
    )

    # Serialise once; the same text goes to the file and to stdout.
    rendered = json.dumps(job.extract_result, indent=2)
    Path("output_llamaparse.json").write_text(rendered, encoding="utf-8")
    print(rendered)
60+
61+
62+
if __name__ == "__main__":
63+
main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
docling==2.102.2
2+
llama-cloud>=2.9.0
3+
pandas>=2.0.0
4+
pydantic>=2.0.0
155 KB
Binary file not shown.

0 commit comments

Comments
 (0)