From aac1d16dc3b39abd83f468d39254e1c81d76c871 Mon Sep 17 00:00:00 2001 From: Tharick Jairam Date: Fri, 3 Oct 2025 14:44:46 -0400 Subject: [PATCH 1/4] Added import and export of documents and knodes --- deepdoc_client_action/CHANGELOG.md | 5 +- deepdoc_client_action/app/app.py | 101 ++++++++++++++++++ .../deepdoc_client_action.jac | 66 +++++++++--- deepdoc_client_action/export_documents.jac | 40 +++++++ deepdoc_client_action/import_documents.jac | 62 +++++++++++ deepdoc_client_action/info.yaml | 2 +- deepdoc_client_action/lib.jac | 4 +- 7 files changed, 265 insertions(+), 15 deletions(-) create mode 100644 deepdoc_client_action/export_documents.jac create mode 100644 deepdoc_client_action/import_documents.jac diff --git a/deepdoc_client_action/CHANGELOG.md b/deepdoc_client_action/CHANGELOG.md index aeab5ad..c02cd06 100644 --- a/deepdoc_client_action/CHANGELOG.md +++ b/deepdoc_client_action/CHANGELOG.md @@ -74,4 +74,7 @@ - Updated docs # 0.1.11 -- Increase timeout and add logs \ No newline at end of file +- Increase timeout and add logs + +# 0.1.12 +- Added import and export of documents and knodes \ No newline at end of file diff --git a/deepdoc_client_action/app/app.py b/deepdoc_client_action/app/app.py index 32dd5bf..7191b16 100644 --- a/deepdoc_client_action/app/app.py +++ b/deepdoc_client_action/app/app.py @@ -7,6 +7,7 @@ from typing import Dict import streamlit as st +import yaml from jvclient.lib.utils import call_api, get_reports_payload from jvclient.lib.widgets import app_header, app_update_action from streamlit_router import StreamlitRouter @@ -271,6 +272,106 @@ def get_status_badge(status: str) -> str: color = color_map.get(status, "gray") return f"{status}" + with st.expander("Export document", True): + # Fetch documents with pagination parameters + with_embeddings = st.toggle( + "Export with Embeddings", value=True, key=f"{model_key}_with_embeddings" + ) + result = call_api( + endpoint="action/walker/deepdoc_client_action/export_documents", + 
json_data={ + "agent_id": agent_id, + "reporting": True, + "with_embeddings": with_embeddings, + }, + timeout=120, + ) + + if result and result.status_code == 200: + payload = get_reports_payload(result) + if payload: + st.download_button( + label="Download Documents", + data=json.dumps(payload, indent=2, ensure_ascii=False), + file_name="deepdoc_documents.json", + mime="application/json", + ) + else: + st.error("No job ID returned from the API. Please try again.") + + with st.expander("Import document", True): + knode_source = st.radio( + "Choose data source:", + ("Text input", "Upload file"), + key=f"{model_key}_knode_source", + ) + + purge_collection = st.toggle( + "Purge Collection", + value=False, + key=f"{model_key}_purge_collection", + ) + + data_to_import = "" + if knode_source == "Text input": + data_to_import = st.text_area( + "Document in YAML or JSON", + value="", + height=170, + key=f"{model_key}_knode_data", + ) + + uploaded_file = None + if knode_source == "Upload file": + uploaded_file = st.file_uploader( + "Upload file (YAML or JSON)", + type=["yaml", "json"], + key=f"{model_key}_document_upload", + ) + + with_embeddings = st.toggle( + "Import with Embeddings", + value=True, + key=f"{model_key}_import_embeddings", + ) + + if st.button("Import", key=f"{model_key}_btn_import_document"): + if uploaded_file: + try: + file_content = uploaded_file.read().decode( + "utf-8", errors="replace" + ) + if uploaded_file.type == "application/json": + data_to_import = json.loads(file_content) + else: + data_to_import = yaml.safe_load(file_content) + data_to_import = json.dumps(data_to_import, ensure_ascii=False) + except Exception as e: + st.error(f"Error loading file: {e}") + + if data_to_import: + st.info("Importing agent Document...") + result = call_api( + endpoint="action/walker/deepdoc_client_action/import_documents", + json_data={ + "agent_id": agent_id, + "data": data_to_import, + "with_embeddings": with_embeddings, + "purge": purge_collection, + }, + ) + 
st.write(result) + if result: + st.success("Agent documents imported successfully") + else: + st.error( + "Failed to import document. Ensure valid YAML/JSON format." + ) + else: + st.error( + "No data to import. Please provide valid text or upload a file." + ) + with st.expander("Document List", True): # Initialize session state variables for pagination if "current_page" not in st.session_state: diff --git a/deepdoc_client_action/deepdoc_client_action.jac b/deepdoc_client_action/deepdoc_client_action.jac index bfcbba3..f3cb612 100644 --- a/deepdoc_client_action/deepdoc_client_action.jac +++ b/deepdoc_client_action/deepdoc_client_action.jac @@ -1,5 +1,6 @@ import os; import re; +import json; import requests; import logging; import traceback; @@ -941,10 +942,18 @@ node DeepDocClientAction(Action) { return success; } - def export_collection() -> dict { + def export_collection(with_embeddings:bool=False) -> dict { collection = self.get_collection(); export_collection = collection spawn _export_collection(); - return {"documents": export_collection.documents}; + + vector_store_action = self.get_agent().get_action(action_label=self.vector_store_action); + knodes = vector_store_action.export_knodes( + as_json=True, + with_embeddings=with_embeddings, + with_ids=True + ); + + return {"documents": export_collection.documents, "knodes": json.loads(knodes)}; } def import_collection(data:dict, purge:bool=True) -> bool { @@ -961,6 +970,29 @@ node DeepDocClientAction(Action) { return False; } + def import_documents(data:dict, with_embeddings:bool=False, purge:bool=True) -> bool { + try { + if data.get("documents") { + self.import_collection(data=data, purge=purge); + } + + if data.get("knodes") { + vector_store_action = self.get_agent().get_action(action_label=self.vector_store_action); + if purge { + vector_store_action.delete_collection(); + } + vector_store_action.import_knodes(data=data.get("knodes"), with_embeddings=with_embeddings); + } + } except Exception as e { + 
self.logger.error(f"Failed to import documents: {str(e)}"); + self.logger.error(traceback.format_exc()); + return False; + } + + return True; + + } + } walker _get_job_entry { @@ -1001,19 +1033,23 @@ walker _export_collection { can on_doc_file_entry with DocFileEntry entry { job_entry_id = [<--](`?JobEntry)[0].job_id; + job_status = [<--](`?JobEntry)[0].status; + if job_entry_id not in self.documents { - self.documents[job_entry_id] = [here.export()]; + self.documents[job_entry_id] = [{"document": here.export(), "job_status": job_status}]; } else { - self.documents[job_entry_id].append(here.export()); + self.documents[job_entry_id].append({"document": here.export(), "job_status": job_status}); } } can on_doc_url_entry with DocURLEntry entry { job_entry_id = [<--](`?JobEntry)[0].job_id; + job_status = [<--](`?JobEntry)[0].status; + if job_entry_id not in self.documents { - self.documents[job_entry_id] = [here.export()]; + self.documents[job_entry_id] = [{"document": here.export(), "job_status": job_status}]; } else { - self.documents[job_entry_id].append(here.export()); + self.documents[job_entry_id].append({"document": here.export(), "job_status": job_status}); } } @@ -1029,16 +1065,18 @@ walker _import_collection { can on_collection with Collection entry { for job_id in self.documents { visit [-->](`?JobEntry)(?job_id == job_id) else { - job_entry = JobEntry(collection_id=here.id, job_id=job_id); + job_entry = JobEntry(collection_id=here.id, job_id=job_id, status=self.documents[job_id][0]["job_status"]); here ++> job_entry; - for document_entry in self.documents[job_id] { + for document in self.documents[job_id] { + document_entry = document.get("document"); + if document_entry.get("mimetype") == "url"{ doc_url_entry = DocURLEntry( collection_id = here.id, job_id = job_id, - status = ItemStatus.PENDING if not job_id else ItemStatus.PROCESSING, + status = document_entry.get("status"), name = document_entry.get("name"), source = document_entry.get("source"), 
metadata = document_entry.get("metadata") @@ -1051,7 +1089,7 @@ walker _import_collection { doc_file_entry = DocFileEntry( collection_id = here.id, job_id = job_id, - status = ItemStatus.PENDING if not job_id else ItemStatus.PROCESSING, + status = document_entry.get("status"), name = document_entry.get("name"), source = document_entry.get("source"), mimetype = document_entry.get("mimetype"), @@ -1068,12 +1106,14 @@ walker _import_collection { can on_job_entry with JobEntry entry { for document_entry in self.documents[here.job_id] { + document_entry = document_entry.get("document"); + if document_entry.get("mimetype") == "url" and not [-->](`?DocURLEntry)(?name == document_entry.get("name")){ doc_url_entry = DocURLEntry( collection_id = here.collection_id, job_id = here.job_id, - status = ItemStatus.PENDING if not here.job_id else ItemStatus.PROCESSING, + status = document_entry.get("status"), name = document_entry.get("name"), source = document_entry.get("source"), metadata = document_entry.get("metadata") @@ -1086,7 +1126,7 @@ walker _import_collection { doc_file_entry = DocFileEntry( collection_id = here.collection_id, job_id = here.job_id, - status = ItemStatus.PENDING if not here.job_id else ItemStatus.PROCESSING, + status = document_entry.get("status"), name = document_entry.get("name"), source = document_entry.get("source"), mimetype = document_entry.get("mimetype"), @@ -1097,4 +1137,6 @@ walker _import_collection { } } } + + } \ No newline at end of file diff --git a/deepdoc_client_action/export_documents.jac b/deepdoc_client_action/export_documents.jac new file mode 100644 index 0000000..7d96285 --- /dev/null +++ b/deepdoc_client_action/export_documents.jac @@ -0,0 +1,40 @@ +import logging; +import from logging { Logger } +import from jivas.agent.core.agent { Agent } +import from jivas.agent.action.action { Action } +import from jivas.agent.action.actions { Actions } +import from jivas.agent.action.agent_graph_walker { agent_graph_walker } +import from
jivas.agent.modules.action.path { action_walker_path } + + +walker export_documents(agent_graph_walker) { + # action endpoint for exporting DeepDocClientAction documents and their knodes + has with_embeddings:bool = False; + has response:list[dict] = []; + has reporting:bool = True; + + # set up logger + static has logger:Logger = logging.getLogger(__name__); + + class __specs__ { + static has private: bool = False; + static has path: str = action_walker_path(__module__); + static has excluded: list[str] = ["response"]; # exclude response from the specs + } + + can on_agent with Agent entry { + visit [-->](`?Actions); + } + + can on_actions with Actions entry { + visit [-->](`?Action)(?enabled==True)(?label=='DeepDocClientAction'); + } + + can on_action with Action entry { + self.response = here.export_collection(with_embeddings=self.with_embeddings); + + if self.reporting { + report self.response; + } + } +} \ No newline at end of file diff --git a/deepdoc_client_action/import_documents.jac b/deepdoc_client_action/import_documents.jac new file mode 100644 index 0000000..246a6ad --- /dev/null +++ b/deepdoc_client_action/import_documents.jac @@ -0,0 +1,62 @@ +import logging; +import json; +import yaml; +import from logging { Logger } +import from jivas.agent.core.agent { Agent } +import from jivas.agent.action.action { Action } +import from jivas.agent.action.actions { Actions } +import from jivas.agent.action.agent_graph_walker { agent_graph_walker } +import from jivas.agent.modules.action.path { action_walker_path } + + +walker import_documents(agent_graph_walker) { + # action endpoint for importing documents and knodes through DeepDocClientAction + has with_embeddings:bool = False; + has data:str = ""; + has purge:bool = False; + has response:list[dict] = []; + has reporting:bool = True; + + # set up logger + static has logger:Logger = logging.getLogger(__name__); + + class __specs__ { + static has private: bool = False; + static has path: str =
action_walker_path(__module__); + static has excluded: list[str] = ["response"]; # exclude response from the specs + } + + can on_agent with Agent entry { + self.logger.info("agent Importing documents from DeepDocClientAction."); + visit [-->](`?Actions); + } + + can on_actions with Actions entry { + visit [-->](`?Action)(?enabled==True)(?label=='DeepDocClientAction'); + } + + can on_action with Action entry { + self.logger.info("Importing documents from DeepDocClientAction."); + self.logger.info(f"data: {self.data}"); + + if isinstance(self.data, str) { + try { + data = json.loads(self.data); + } except json.JSONDecodeError { + try { + data = yaml.safe_load(self.data); + } except yaml.YAMLError as e { + self.logger.error(f"Invalid data format: {e}"); + return False; + } + } + } else { + data = self.data; + } + self.response = here.import_documents(data=data, with_embeddings=self.with_embeddings, purge=self.purge); + + if self.reporting { + report self.response; + } + } +} \ No newline at end of file diff --git a/deepdoc_client_action/info.yaml b/deepdoc_client_action/info.yaml index 9e675d7..69c06b1 100644 --- a/deepdoc_client_action/info.yaml +++ b/deepdoc_client_action/info.yaml @@ -2,7 +2,7 @@ package: name: jivas/deepdoc_client_action author: V75 Inc. 
archetype: DeepDocClientAction - version: 0.1.11 + version: 0.1.12 meta: title: DeepDoc Client Action description: Integrates with DeepDoc OCR and document parsing services to ingest documents into a vector store diff --git a/deepdoc_client_action/lib.jac b/deepdoc_client_action/lib.jac index adea674..978a151 100644 --- a/deepdoc_client_action/lib.jac +++ b/deepdoc_client_action/lib.jac @@ -6,5 +6,7 @@ import from actions.jivas.deepdoc_client_action { list_documents, deepdoc_callback, delete_job, - retrieve_job + retrieve_job, + export_documents, + import_documents } \ No newline at end of file From e9a2427d98ae8139f7116a7c66a81e2ae4f7bdbc Mon Sep 17 00:00:00 2001 From: Tharick Jairam Date: Fri, 3 Oct 2025 14:55:30 -0400 Subject: [PATCH 2/4] remove log --- deepdoc_client_action/app/app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepdoc_client_action/app/app.py b/deepdoc_client_action/app/app.py index 7191b16..2eff4c5 100644 --- a/deepdoc_client_action/app/app.py +++ b/deepdoc_client_action/app/app.py @@ -350,7 +350,6 @@ def get_status_badge(status: str) -> str: st.error(f"Error loading file: {e}") if data_to_import: - st.info("Importing agent Document...") result = call_api( endpoint="action/walker/deepdoc_client_action/import_documents", json_data={ @@ -360,7 +359,7 @@ def get_status_badge(status: str) -> str: "purge": purge_collection, }, ) - st.write(result) + if result: st.success("Agent documents imported successfully") else: From 67654bdbe2b36332107b5ecf2834fc72710646cb Mon Sep 17 00:00:00 2001 From: Tharick Jairam Date: Wed, 29 Oct 2025 11:22:31 -0400 Subject: [PATCH 3/4] added toc chunker --- deepdoc_client_action/add_documents.jac | 4 +- deepdoc_client_action/app/app.py | 133 ++++++++++++++++-- .../deepdoc_client_action.jac | 8 +- 3 files changed, 128 insertions(+), 17 deletions(-) diff --git a/deepdoc_client_action/add_documents.jac b/deepdoc_client_action/add_documents.jac index 7bd5f6c..ca4f83f 100644 --- 
a/deepdoc_client_action/add_documents.jac +++ b/deepdoc_client_action/add_documents.jac @@ -22,6 +22,7 @@ walker add_documents(agent_graph_walker) { has with_embeddings:bool = False; # whether to generate embeddings for the documents has response:str = ""; has reporting:bool = True; + has chunker_type:str = "hybrid"; # set up logger static has logger:Logger = logging.getLogger(__name__); @@ -89,7 +90,8 @@ walker add_documents(agent_graph_walker) { to_page=self.to_page, lang=self.lang, with_embeddings=self.with_embeddings, - callback_url=callback_url + callback_url=callback_url, + chunker_type=self.chunker_type ); if self.reporting { diff --git a/deepdoc_client_action/app/app.py b/deepdoc_client_action/app/app.py index 2eff4c5..37839b6 100644 --- a/deepdoc_client_action/app/app.py +++ b/deepdoc_client_action/app/app.py @@ -22,6 +22,8 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) - :param info: Additional information. """ (model_key, module_root) = app_header(agent_id, action_id, info) + if "job_id_details" not in st.session_state: + st.session_state.job_id_details = "" # add documents section with st.expander("Configure", False): @@ -113,6 +115,11 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) - value=True, ) + chunker_type = st.selectbox( + "Chunker type", + options=["toc", "hybrid", "hierarchical"], + key=f"{model_key}_chunker_type", + ) # Process inputs url_list = [url.strip() for url in doc_urls.split("\n") if url.strip()] metadata_list = [] @@ -150,6 +157,7 @@ def render(router: StreamlitRouter, agent_id: str, action_id: str, info: dict) - "to_page": int(to_page) if to_page is not None else 0, "lang": str(lang), "with_embeddings": with_embeddings, + "chunker_type": chunker_type, } # Add optional fields only if they exist @@ -272,7 +280,7 @@ def get_status_badge(status: str) -> str: color = color_map.get(status, "gray") return f"{status}" - with st.expander("Export document", True): + with 
st.expander("Export document", False): # Fetch documents with pagination parameters with_embeddings = st.toggle( "Export with Embeddings", value=True, key=f"{model_key}_with_embeddings" @@ -299,7 +307,7 @@ def get_status_badge(status: str) -> str: else: st.error("No job ID returned from the API. Please try again.") - with st.expander("Import document", True): + with st.expander("Import document", False): knode_source = st.radio( "Choose data source:", ("Text input", "Upload file"), @@ -593,13 +601,23 @@ def get_status_badge(status: str) -> str: if st.button("No, Keep Job"): st.session_state.confirm_state = {"active": False} st.rerun() - elif st.button("Delete Job", key=f"delete_job_{job_id}"): - st.session_state.confirm_state = { - "active": True, - "type": "delete_job", - "job_id": job_id, - } - st.rerun() + + elif status == "COMPLETED": + col1, col2 = st.columns(2) + with col1: + if st.button("Delete Job", key=f"delete_job_{job_id}"): + st.session_state.confirm_state = { + "active": True, + "type": "delete_job", + "job_id": job_id, + } + st.rerun() + with col2: + if st.button("View Job", key=f"view_job_{job_id}"): + st.session_state.current_page = 3 + st.session_state.job_id_details = job_id + st.session_state.job_details = documents + st.rerun() # Display each document in the job for document in documents: @@ -718,9 +736,94 @@ def get_status_badge(status: str) -> str: time.sleep(5) st.rerun() - else: - st.info( - "No documents found. Your uploaded documents will be shown here." - ) - else: - st.info("No documents found. 
Your uploaded documents will be shown here.") + if st.session_state.job_id_details: + st.write("---") + st.write("## Job Details") + + if "page" not in st.session_state[model_key]: + st.session_state[model_key]["page"] = 1 + if "per_page" not in st.session_state[model_key]: + st.session_state[model_key]["per_page"] = 10 + + # Items per page selection + per_page_options = [10, 20, 30, 50, 100] + new_per_page = st.selectbox( + "Documents per page:", + per_page_options, + index=per_page_options.index(st.session_state[model_key]["per_page"]), + ) + + # Reset page if per_page changes + if new_per_page != st.session_state[model_key]["per_page"]: + st.session_state[model_key]["per_page"] = new_per_page + st.session_state[model_key]["page"] = 1 + st.rerun() + + st.session_state[model_key]["pages_input"] = st.text_input( + "Enter page numbers (comma or space separated):", + value="", # optional default value + placeholder="e.g., 1,2,3", + ) + + st.session_state[model_key]["pages_input"] = [ + p.strip() + for p in st.session_state[model_key]["pages_input"] + .replace(",", " ") + .split() + if p.strip().isdigit() + ] + st.session_state[model_key][ + "filter_by" + ] = f'metadata.job_id:="{st.session_state.job_id_details}"' + + if st.session_state[model_key]["pages_input"]: + st.session_state[model_key][ + "filter_by" + ] += f' && metadata.page:=[{",".join(st.session_state[model_key]["pages_input"])}]' + + params = { + "page": st.session_state[model_key].get("page", 1), + "per_page": st.session_state[model_key].get("per_page", 10), + "filter_by": st.session_state[model_key]["filter_by"], + "agent_id": agent_id, + } + + response = call_api( + endpoint="action/walker/typesense_vector_store_action/list_documents", + json_data=params, + ) + + if response and response.status_code == 200: + result = get_reports_payload(response) + documents = result.get("documents", []) + + for doc in documents: + if doc["metadata"].get("title"): + title = doc["metadata"]["title"][0].strip() + else: 
+ title = doc["text"] + title = title.split("\n")[0].strip() + + title = title[:40] + page = doc["metadata"].get("page", "N/A") + + with st.expander(f"{title} (Page {page})", expanded=False): + + st.write(doc["text"]) + st.write("---") + + col1, col2 = st.columns([5, 1]) # first column 5x width of second + with col1: + st.markdown(f"**Page:** {page}") + with col2: + # Delete button + if st.button("Delete", key=f"delete_{doc['id']}"): + args = {"id": doc["id"], "agent_id": agent_id} + result = call_api( + endpoint="action/walker/typesense_vector_store_action/delete_document", + json_data=args, + ) + + if result and result.status_code == 200: + get_reports_payload(result) + st.rerun() diff --git a/deepdoc_client_action/deepdoc_client_action.jac b/deepdoc_client_action/deepdoc_client_action.jac index f3cb612..7f22b10 100644 --- a/deepdoc_client_action/deepdoc_client_action.jac +++ b/deepdoc_client_action/deepdoc_client_action.jac @@ -77,7 +77,8 @@ node DeepDocClientAction(Action) { to_page:int=100000, lang:str="english", with_embeddings:bool=False, - callback_url:str="" + callback_url:str="", + chunker_type:str="hybrid" ) -> str { # """ @@ -108,6 +109,7 @@ node DeepDocClientAction(Action) { "to_page": to_page, "lang": lang, "with_embeddings": with_embeddings, # default to False, can be overridden + "chunker_type": chunker_type }; # Include the callback URL if provided @@ -505,6 +507,10 @@ node DeepDocClientAction(Action) { chunk_metadata["page"] = self.format_page_range(result_page_nums); # Process bbox, if present chunk_metadata["bbox"] = result.get("metadata", {}).get("bbox", []); + # adding title + chunk_metadata["title"] = result.get("metadata", {}).get("headings", []); + # adding hierarchy + chunk_metadata["hierarchy"] = result.get("metadata", {}).get("hierarchy", []); # Add to batch texts.append(text); metadatas.append(chunk_metadata); From de64a538ede724132338b9c5eca7e83a39bf7fe1 Mon Sep 17 00:00:00 2001 From: Tharick Jairam Date: Wed, 29 Oct 2025 11:22:52 
-0400 Subject: [PATCH 4/4] update change log --- deepdoc_client_action/CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepdoc_client_action/CHANGELOG.md b/deepdoc_client_action/CHANGELOG.md index c02cd06..d6dff4f 100644 --- a/deepdoc_client_action/CHANGELOG.md +++ b/deepdoc_client_action/CHANGELOG.md @@ -77,4 +77,5 @@ - Increase timeout and add logs # 0.1.12 -- Added import and export of documents and knodes \ No newline at end of file +- Added import and export of documents and knodes +- Added TOCChunker \ No newline at end of file