Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 28 additions & 12 deletions sources/semanticscholar_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,26 +54,42 @@ def __init__(self, source_args: dict, llm_config: LLMConfig, common_config: Comm
self._save_fetch_cache(cache_key, self.raw_papers)

def _derive_queries_from_description(self) -> list[str]:
    """Extract search queries from the user's research description.

    The description is scanned line by line:

    * A line containing a negative marker ("not interested", "不感兴趣",
      "don't", "exclude") closes the interest section and is skipped.
    * A line mentioning "interest" (or "关注" / "研究") opens the interest
      section; the header line itself is not a topic.
    * Inside the interest section, every numbered item ("1. topic",
      "2) topic", "3、topic", ...) yields one query: the item text up to
      the first explanatory separator (em dash, en dash, " - ", ASCII or
      fullwidth colon), truncated to 120 characters. Items of one
      character or less are ignored.

    Returns:
        The extracted queries in order of first appearance, with
        case-insensitive duplicates removed. Falls back to
        ``["artificial intelligence"]`` when the description is empty or
        yields no query.
    """
    import re

    fallback = ["artificial intelligence"]
    desc = self.description.strip()
    if not desc:
        return fallback

    # Loop invariants, built once.
    negative_markers = ("not interested", "不感兴趣", "don't", "exclude")
    header_markers = ("关注", "研究")
    # Tried in this order; "\uff1a" is the fullwidth colon used in CJK text.
    separators = ("—", "–", " - ", ":", "\uff1a")
    numbered_item = re.compile(r'^\d+[\.\)\-:、]\s*(.*)')
    interest_word = re.compile(r'\binterest')

    queries: list[str] = []
    seen: set[str] = set()
    in_interest_section = False

    for raw_line in desc.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        lower = line.lower()

        # Must run before the header test: "not interested" also contains
        # the word "interest".
        if any(marker in lower for marker in negative_markers):
            in_interest_section = False
            continue

        # Numbered items are matched before the header test so that a topic
        # such as "1. Interest rate models" or "2. 图神经网络研究" is kept
        # as a topic instead of being mistaken for a section header.
        item = numbered_item.match(line) if in_interest_section else None
        if item is None:
            if interest_word.search(lower) or any(m in lower for m in header_markers):
                in_interest_section = True
            continue

        # Keep only the topic name, dropping any trailing explanation.
        content = item.group(1)
        for sep in separators:
            if sep in content:
                content = content.split(sep, 1)[0].strip()
                break
        if len(content) <= 1:
            continue

        query = content[:120]
        key = query.casefold()
        if key not in seen:  # avoid issuing the same search twice
            seen.add(key)
            queries.append(query)

    return queries or fallback
Comment on lines +91 to 94

Expand Down