diff --git a/examples/herc2/classify_herc2.py b/examples/herc2/classify_herc2.py index fc5112d..393aae1 100644 --- a/examples/herc2/classify_herc2.py +++ b/examples/herc2/classify_herc2.py @@ -10,6 +10,15 @@ # (A;G) also tends toward brown. # (G;G) gives blue eye color ~99% of the time. +def _format_allele_label(allele): + if isinstance(allele, Alleles): + return ",".join(sorted(a.value for a in allele)) + if isinstance(allele, str): + return allele + if hasattr(allele, "__iter__"): + return ",".join(sorted(str(a) for a in allele)) + return str(allele) + class HERC2Classifier(GenotypeClassifier): def classify(self, matches): match = matches.get(rs12913832) @@ -25,10 +34,14 @@ def classify(self, matches): result = "No call" genotype_sorted = None match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 else: genotype_sorted = match.genotype_sorted result = eye_color_map.get(genotype_sorted, "Unknown") match_type = match.match_type.value + ref_count = match.ref_count + alt_count = match.alt_count # Extract properties from match (source_row has the actual data from the file) if match and match.source_row: @@ -41,15 +54,21 @@ def classify(self, matches): position = None # Create report row using match properties + ref_label = _format_allele_label(rs12913832.ref) + alt_label = _format_allele_label(rs12913832.alt) report_row = { "participant_id": self.participant_id, "filename": self.filename, "rsid": rsid, "chromosome": chromosome, "position": position, + "ref": ref_label, + "alt": alt_label, "genotype": genotype_sorted, "match_type": match_type, "eye_color": result, + "ref_count": ref_count, + "alt_count": alt_count, } # Write to TSV file (as a list with one row) @@ -93,6 +112,8 @@ def test_brown_homozygous(): assert result[0]["rsid"] == "rs12913832" assert result[0]["chromosome"] == "15" assert result[0]["position"] == 28120472 + assert result[0]["ref_count"] == 2 + assert result[0]["alt_count"] == 0 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -104,6 +125,8 @@ def test_brown_heterozygous_unsorted(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 1 + assert result[0]["alt_count"] == 1 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -115,6 +138,8 @@ def test_blue_homozygous(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 0 + assert result[0]["alt_count"] == 2 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -126,11 +151,13 @@ def test_no_call(): assert result[0]["match_type"] == MatchType.NO_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 0 + assert result[0]["alt_count"] == 0 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") def test_unexpected_allele_c(): - """Test handling of unexpected C allele (not in reference map).""" + 'Test handling of unexpected C allele (not in reference map).' result = classify_fixture("AC") assert len(result) == 1 assert result[0]["eye_color"] == "Unknown" @@ -138,5 +165,7 @@ def test_unexpected_allele_c(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 1 + assert result[0]["alt_count"] == 1 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") diff --git a/examples/herc2/herc2-classifier/assets/aggregate_population_stats.py b/examples/herc2/herc2-classifier/assets/aggregate_population_stats.py new file mode 100644 index 0000000..6a940e7 --- /dev/null +++ b/examples/herc2/herc2-classifier/assets/aggregate_population_stats.py @@ -0,0 +1,144 @@ +import argparse +import csv +import re +from collections import defaultdict + +def parse_args(): + parser = argparse.ArgumentParser(description="Aggregate population allele statistics") + parser.add_argument("--input", required=True, help="Input TSV from combined classifier output") + parser.add_argument("--output", required=True, help="Output TSV path") + return parser.parse_args() + +def locus_key_for_row(row): + chrom = (row.get("chromosome") or "").strip() + pos = (row.get("position") or "").strip() + ref = (row.get("ref") or "").strip() + alt = (row.get("alt") or "").strip() + if not (chrom and pos and ref and alt): + return None + return f"{chrom}-{pos}-{ref}-{alt}" + +def to_int(value): + try: + return int(value) + except Exception: + return 0 + +def parse_alleles(genotype, alleles): + if not genotype: + return [] + genotype = genotype.strip() + if not genotype: + return [] + + # Split on common genotype separators + if "/" in genotype or "|" in genotype: + parts = [p for p in re.split(r"[\/|]", genotype) if p] + return parts + + # If alleles are same length and genotype is a concat of two alleles, split evenly + if alleles: + lengths = {len(a) for a in alleles if a} + if len(lengths) == 1: + allele_len = lengths.pop() + if allele_len > 0 and len(genotype) == 2 * allele_len: + return [genotype[:allele_len], genotype[allele_len:]] + + # Fallback for single-base alleles like "AA" or "AG" + if len(genotype) == 2: + return [genotype[0], genotype[1]] + + return [genotype] + +def main(): + args = parse_args() + + # stats keyed by (locus_base, allele) + stats = defaultdict(lambda: { + "allele_count": 0, + "num_homo": 0, + "num_hetero": 0, + "rsid": None, + }) + locus_called_samples = defaultdict(int) + + with open(args.input, "r", encoding="utf-8") as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + match_type = (row.get("match_type") or "").strip() + if match_type.lower() == "no call": + continue + + locus_key = locus_key_for_row(row) + if locus_key is None: + continue + + alt_count = to_int(row.get("alt_count")) + ref_count = to_int(row.get("ref_count")) + allele_number = ref_count + alt_count + + if allele_number <= 0: + continue + + chrom = (row.get("chromosome") or "").strip() + pos = (row.get("position") or "").strip() + ref = (row.get("ref") or "").strip() + alt = (row.get("alt") or "").strip() + locus_base = f"{chrom}-{pos}" + + alleles = [] + if ref: + alleles.append(ref) + if alt: + alleles.extend([a.strip() for a in alt.split(",") if a.strip()]) + + genotype = (row.get("genotype") or "").strip() + gt_alleles = parse_alleles(genotype, alleles) + if not gt_alleles: + continue + + locus_called_samples[locus_base] += 1 + + rsid = row.get("rsid") + is_homo = len(gt_alleles) == 2 and gt_alleles[0] == gt_alleles[1] + + for allele in gt_alleles: + key = (locus_base, allele) + entry = stats[key] + entry["allele_count"] += 1 + if is_homo and allele == gt_alleles[0]: + entry["num_homo"] += 1 + elif not is_homo: + entry["num_hetero"] += 1 + if rsid and not entry["rsid"]: + entry["rsid"] = rsid + + with open(args.output, "w", encoding="utf-8", newline="") as out_fh: + fieldnames = [ + "locus_key", + "allele_count", + "allele_number", + "num_homo", + "num_hetero", + "allele_freq", + "rsid", + ] + writer = csv.DictWriter(out_fh, fieldnames=fieldnames, delimiter="\t") + writer.writeheader() + for (locus_base, allele) in sorted(stats.keys()): + entry = stats[(locus_base, allele)] + called = locus_called_samples.get(locus_base, 0) + allele_number = called * 2 + allele_freq = (entry["allele_count"] / allele_number) if allele_number else 0 + writer.writerow({ + "locus_key": f"{locus_base}-{allele}", + "allele_count": entry["allele_count"], + "allele_number": allele_number, + "num_homo": entry["num_homo"], + "num_hetero": entry["num_hetero"], + "allele_freq": f"{allele_freq:.6f}", + "rsid": entry["rsid"], + }) + +if __name__ == "__main__": + main() diff --git a/examples/herc2/herc2-classifier/assets/classify_herc2.py b/examples/herc2/herc2-classifier/assets/classify_herc2.py index fc5112d..393aae1 100644 --- a/examples/herc2/herc2-classifier/assets/classify_herc2.py +++ b/examples/herc2/herc2-classifier/assets/classify_herc2.py @@ -10,6 +10,15 @@ # (A;G) also tends toward brown. # (G;G) gives blue eye color ~99% of the time. +def _format_allele_label(allele): + if isinstance(allele, Alleles): + return ",".join(sorted(a.value for a in allele)) + if isinstance(allele, str): + return allele + if hasattr(allele, "__iter__"): + return ",".join(sorted(str(a) for a in allele)) + return str(allele) + class HERC2Classifier(GenotypeClassifier): def classify(self, matches): match = matches.get(rs12913832) @@ -25,10 +34,14 @@ def classify(self, matches): result = "No call" genotype_sorted = None match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 else: genotype_sorted = match.genotype_sorted result = eye_color_map.get(genotype_sorted, "Unknown") match_type = match.match_type.value + ref_count = match.ref_count + alt_count = match.alt_count # Extract properties from match (source_row has the actual data from the file) if match and match.source_row: @@ -41,15 +54,21 @@ def classify(self, matches): position = None # Create report row using match properties + ref_label = _format_allele_label(rs12913832.ref) + alt_label = _format_allele_label(rs12913832.alt) report_row = { "participant_id": self.participant_id, "filename": self.filename, "rsid": rsid, "chromosome": chromosome, "position": position, + "ref": ref_label, + "alt": alt_label, "genotype": genotype_sorted, "match_type": match_type, "eye_color": result, + "ref_count": ref_count, + "alt_count": alt_count, } # Write to TSV file (as a list with one row) @@ -93,6 +112,8 @@ def test_brown_homozygous(): assert result[0]["rsid"] == "rs12913832" assert result[0]["chromosome"] == "15" assert result[0]["position"] == 28120472 + assert result[0]["ref_count"] == 2 + assert result[0]["alt_count"] == 0 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -104,6 +125,8 @@ def test_brown_heterozygous_unsorted(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 1 + assert result[0]["alt_count"] == 1 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -115,6 +138,8 @@ def test_blue_homozygous(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 0 + assert result[0]["alt_count"] == 2 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") @@ -126,11 +151,13 @@ def test_no_call(): assert result[0]["match_type"] == MatchType.NO_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 0 + assert result[0]["alt_count"] == 0 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") def test_unexpected_allele_c(): - """Test handling of unexpected C allele (not in reference map).""" + 'Test handling of unexpected C allele (not in reference map).' result = classify_fixture("AC") assert len(result) == 1 assert result[0]["eye_color"] == "Unknown" @@ -138,5 +165,7 @@ def test_unexpected_allele_c(): assert result[0]["match_type"] == MatchType.VARIANT_CALL.value assert result[0]["participant_id"] == "TEST_ID" assert result[0]["filename"] == "test.txt" + assert result[0]["ref_count"] == 1 + assert result[0]["alt_count"] == 1 # Cleanup os.remove("result_HERC2_TEST_ID.tsv") diff --git a/examples/herc2/herc2-classifier/flow.yaml b/examples/herc2/herc2-classifier/flow.yaml index 797846d..d0eb9eb 100644 --- a/examples/herc2/herc2-classifier/flow.yaml +++ b/examples/herc2/herc2-classifier/flow.yaml @@ -4,22 +4,23 @@ metadata: name: herc2-classifier version: 0.1.1 spec: + steps: + - id: herc2 + uses: + source: + kind: local + path: ./ + with: + participants: inputs.samplesheet + publish: + classification_result: File(result_HERC2.tsv) + population_stats: File(result_HERC2_stats.tsv) + store: + counts_sql: + kind: sql + source: classification_result + table: herc2_{run_id} + key_column: participant_id inputs: samplesheet: type: List[GenotypeRecord] - steps: - - id: herc2 - uses: - source: - kind: local - path: ./ - with: - participants: inputs.samplesheet - publish: - classification_result: File(result_HERC2.tsv) - store: - counts_sql: - kind: sql - source: classification_result - table: herc2_{run_id} - key_column: participant_id diff --git a/examples/herc2/herc2-classifier/module.yaml b/examples/herc2/herc2-classifier/module.yaml index 21224f0..1519428 100644 --- a/examples/herc2/herc2-classifier/module.yaml +++ b/examples/herc2/herc2-classifier/module.yaml @@ -3,30 +3,38 @@ kind: Module metadata: name: herc2-classifier version: 0.1.1 - description: Classification of HERC2 genotypes for eye color prediction. authors: - - madhava@openmined.org + - madhava@openmined.org + description: Classification of HERC2 genotypes for eye color prediction. spec: runner: kind: nextflow entrypoint: workflow.nf template: dynamic-nextflow - assets: - - path: classify_herc2.py + image: ghcr.io/openmined/bioscript:0.1.6 inputs: - - name: participants - type: List[GenotypeRecord] - description: CSV/TSV with participant_id and genotype_file columns - format: - kind: csv - mapping: - participant_id: participant_id - genotype_file: genotype_file + - name: participants + type: List[GenotypeRecord] + description: CSV/TSV with participant_id and genotype_file columns + format: + kind: csv + mapping: + participant_id: participant_id + genotype_file: genotype_file outputs: - - name: classification_result - type: File - description: HERC2 eye color classification (aggregated) - format: - kind: tsv - path: result_HERC2.tsv + - name: classification_result + type: File + description: HERC2 eye color classification (aggregated) + format: + kind: tsv + path: result_HERC2.tsv + - name: population_stats + type: File + description: HERC2 population allele statistics (aggregated) + format: + kind: tsv + path: result_HERC2_stats.tsv parameters: [] + assets: + - path: classify_herc2.py + - path: aggregate_population_stats.py diff --git a/examples/herc2/herc2-classifier/workflow.nf b/examples/herc2/herc2-classifier/workflow.nf index ff56507..f01addc 100644 --- a/examples/herc2/herc2-classifier/workflow.nf +++ b/examples/herc2/herc2-classifier/workflow.nf @@ -33,8 +33,15 @@ workflow USER { per_participant_results.collect() ) + // Aggregate population statistics + def population_stats_ch = aggregate_population_stats( + Channel.value(assetsDirPath), + aggregated + ) + emit: classification_result = aggregated + population_stats = population_stats_ch } process herc2_classifier { @@ -74,3 +81,20 @@ process aggregate_results { bioscript combine --list results.list --output result_HERC2.tsv """ } + +process aggregate_population_stats { + container 'ghcr.io/openmined/bioscript:0.1.6' + publishDir params.results_dir, mode: 'copy', overwrite: true + + input: + path assets_dir + path aggregated_results + + output: + path "result_HERC2_stats.tsv" + + script: + """ + python3 "${assets_dir}/aggregate_population_stats.py" --input "${aggregated_results}" --output result_HERC2_stats.tsv + """ +} diff --git a/examples/herc2/herc2_dev.ipynb b/examples/herc2/herc2_dev.ipynb index 500fca5..5cee6a5 100644 --- a/examples/herc2/herc2_dev.ipynb +++ b/examples/herc2/herc2_dev.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "66a0c985-275c-4213-902b-45f5c188d9dd", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b025ff95-b911-4b42-8f63-35567637dfb9", "metadata": {}, "outputs": [], @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "014b798d-a680-461b-822b-b3c72c504488", "metadata": {}, "outputs": [], @@ -50,11 +50,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "fbf79259-7445-4284-a17a-a8f584914f80", "metadata": {}, "outputs": [], "source": [ + "def _format_allele_label(allele):\n", + " if isinstance(allele, Alleles):\n", + " return \",\".join(sorted(a.value for a in allele))\n", + " if isinstance(allele, str):\n", + " return allele\n", + " if hasattr(allele, \"__iter__\"):\n", + " return \",\".join(sorted(str(a) for a in allele))\n", + " return str(allele)\n", + "\n", "class HERC2Classifier(GenotypeClassifier):\n", " def classify(self, matches):\n", " match = matches.get(rs12913832)\n", @@ -70,10 +79,14 @@ " result = \"No call\"\n", " genotype_sorted = None\n", " match_type = MatchType.NO_CALL.value\n", + " ref_count = 0\n", + " alt_count = 0\n", " else:\n", " genotype_sorted = match.genotype_sorted\n", " result = eye_color_map.get(genotype_sorted, \"Unknown\")\n", " match_type = match.match_type.value\n", + " ref_count = match.ref_count\n", + " alt_count = match.alt_count\n", " \n", " # Extract properties from match (source_row has the actual data from the file)\n", " if match and match.source_row:\n", @@ -86,27 +99,33 @@ " position = None\n", " \n", " # Create report row using match properties\n", + " ref_label = _format_allele_label(rs12913832.ref)\n", + " alt_label = _format_allele_label(rs12913832.alt)\n", " report_row = {\n", " \"participant_id\": self.participant_id,\n", " \"filename\": self.filename,\n", " \"rsid\": rsid,\n", " \"chromosome\": chromosome,\n", " \"position\": position,\n", + " \"ref\": ref_label,\n", + " \"alt\": alt_label,\n", " \"genotype\": genotype_sorted,\n", " \"match_type\": match_type,\n", " \"eye_color\": result,\n", + " \"ref_count\": ref_count,\n", + " \"alt_count\": alt_count,\n", " }\n", " \n", " # Write to TSV file (as a list with one row)\n", " write_tsv(f\"{self.output_basename}.tsv\", [report_row])\n", " \n", " # Return list for testing (consistent with BRCA)\n", - " return [report_row]" + " return [report_row]\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9db0d015-66d4-4564-ae99-e1353dad1aef", "metadata": {}, "outputs": [], @@ -120,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2794c5cf-6361-4743-ac7c-10dd370b47f2", "metadata": {}, "outputs": [], @@ -138,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "48cdc397-62fd-473e-af2f-fec6d1ac21c8", "metadata": {}, "outputs": [], @@ -162,6 +181,8 @@ " assert result[0][\"rsid\"] == \"rs12913832\"\n", " assert result[0][\"chromosome\"] == \"15\"\n", " assert result[0][\"position\"] == 28120472\n", + " assert result[0][\"ref_count\"] == 2\n", + " assert result[0][\"alt_count\"] == 0\n", " # Cleanup\n", " os.remove(\"result_HERC2_TEST_ID.tsv\")\n", "\n", @@ -173,6 +194,8 @@ " assert result[0][\"match_type\"] == MatchType.VARIANT_CALL.value\n", " assert result[0][\"participant_id\"] == \"TEST_ID\"\n", " assert result[0][\"filename\"] == \"test.txt\"\n", + " assert result[0][\"ref_count\"] == 1\n", + " assert result[0][\"alt_count\"] == 1\n", " # Cleanup\n", " os.remove(\"result_HERC2_TEST_ID.tsv\")\n", "\n", @@ -184,6 +207,8 @@ " assert result[0][\"match_type\"] == MatchType.VARIANT_CALL.value\n", " assert result[0][\"participant_id\"] == \"TEST_ID\"\n", " assert result[0][\"filename\"] == \"test.txt\"\n", + " assert result[0][\"ref_count\"] == 0\n", + " assert result[0][\"alt_count\"] == 2\n", " # Cleanup\n", " os.remove(\"result_HERC2_TEST_ID.tsv\")\n", "\n", @@ -195,11 +220,13 @@ " assert result[0][\"match_type\"] == MatchType.NO_CALL.value\n", " assert result[0][\"participant_id\"] == \"TEST_ID\"\n", " assert result[0][\"filename\"] == \"test.txt\"\n", + " assert result[0][\"ref_count\"] == 0\n", + " assert result[0][\"alt_count\"] == 0\n", " # Cleanup\n", " os.remove(\"result_HERC2_TEST_ID.tsv\")\n", "\n", "def test_unexpected_allele_c():\n", - " \"\"\"Test handling of unexpected C allele (not in reference map).\"\"\"\n", + " 'Test handling of unexpected C allele (not in reference map).'\n", " result = classify_fixture(\"AC\")\n", " assert len(result) == 1\n", " assert result[0][\"eye_color\"] == \"Unknown\"\n", @@ -207,16 +234,26 @@ " assert result[0][\"match_type\"] == MatchType.VARIANT_CALL.value\n", " assert result[0][\"participant_id\"] == \"TEST_ID\"\n", " assert result[0][\"filename\"] == \"test.txt\"\n", + " assert result[0][\"ref_count\"] == 1\n", + " assert result[0][\"alt_count\"] == 1\n", " # Cleanup\n", - " os.remove(\"result_HERC2_TEST_ID.tsv\")" + " os.remove(\"result_HERC2_TEST_ID.tsv\")\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "d3a03b87-9882-49e5-bdb7-55a3f7834770", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ All tests passed!\n" + ] + } + ], "source": [ "# Run tests\n", "test_brown_homozygous()\n", @@ -229,10 +266,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "b36a3d84-91a6-4a8f-84e0-6c43f6055c23", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Testing: classify_herc2.py\n", + "============================================================\n", + "Running tests with pytest: classify_herc2.py\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/.venv/bin/python3\n", + "cachedir: .pytest_cache\n", + "rootdir: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2\n", + "plugins: anyio-4.12.1\n", + "collected 5 items \u001b[0m\n", + "\n", + "classify_herc2.py::test_brown_homozygous \u001b[32mPASSED\u001b[0m\u001b[32m [ 20%]\u001b[0m\n", + "classify_herc2.py::test_brown_heterozygous_unsorted \u001b[32mPASSED\u001b[0m\u001b[32m [ 40%]\u001b[0m\n", + "classify_herc2.py::test_blue_homozygous \u001b[32mPASSED\u001b[0m\u001b[32m [ 60%]\u001b[0m\n", + "classify_herc2.py::test_no_call \u001b[32mPASSED\u001b[0m\u001b[32m [ 80%]\u001b[0m\n", + "classify_herc2.py::test_unexpected_allele_c \u001b[32mPASSED\u001b[0m\u001b[32m [100%]\u001b[0m\n", + "\n", + "\u001b[32m============================== \u001b[32m\u001b[1m5 passed\u001b[0m\u001b[32m in 0.01s\u001b[0m\u001b[32m ===============================\u001b[0m\n" + ] + } + ], "source": [ "from bioscript import export_from_notebook\n", "export_from_notebook(\"herc2_dev.ipynb\", \"classify_herc2.py\")\n", @@ -242,10 +305,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "hrvdxb063", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test file created: herc2_test_ga.tsv\n", + "Content:\n", + "# rsid\tchromosome\tposition\tgenotype\n", + "rs12913832\t15\t28120472\tGA\n", + "✓ Test passed and file created!\n" + ] + } + ], "source": [ "# Rewritten to write the raw line to a file and use bioscript classify in the next cell\n", "from pathlib import Path\n", @@ -269,10 +344,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "c51e4f6f-9900-4e7c-844e-211a165f69ef", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2\n", + "[bioscript] Provided SNP file argument: herc2_test_ga.tsv\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, __pycache__, classify_herc2.py, herc2-classifier, herc2_dev.ipynb, herc2_test_ga.tsv, result_HERC2_X.tsv, result_HERC2_test_user.tsv\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv\n", + "participant_id=X\n", + "HERC2_count=1\n", + "participant_id\tfilename\trsid\tchromosome\tposition\tref\talt\tgenotype\tmatch_type\teye_color\tref_count\talt_count\n", + "X\therc2_test_ga.tsv\trs12913832\t15\t28120472\tA\tC,G,T,U\tAG\tVariant call\tBrown\t1\t1\n" + ] + } + ], "source": [ "!bioscript classify classify_herc2.py --file herc2_test_ga.tsv --participant_id=\"X\"\n", "!cat result_HERC2_X.tsv" @@ -280,10 +373,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "7830775c-a21a-40f6-acd1-262261e3b0f6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2\n", + "[bioscript] Provided SNP file argument: herc2_test_ga.tsv\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, __pycache__, classify_herc2.py, herc2-classifier, herc2_dev.ipynb, herc2_test_ga.tsv, result_HERC2_X.tsv, result_HERC2_test_user.tsv\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv\n", + "{\n", + " \"participant_id\": \"test_user\",\n", + " \"HERC2_count\": 1,\n", + " \"HERC2_data\": [\n", + " {\n", + " \"participant_id\": \"test_user\",\n", + " \"filename\": \"herc2_test_ga.tsv\",\n", + " \"rsid\": \"rs12913832\",\n", + " \"chromosome\": \"15\",\n", + " \"position\": 28120472,\n", + " \"ref\": \"A\",\n", + " \"alt\": \"C,G,T,U\",\n", + " \"genotype\": \"AG\",\n", + " \"match_type\": \"Variant call\",\n", + " \"eye_color\": \"Brown\",\n", + " \"ref_count\": 1,\n", + " \"alt_count\": 1\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], "source": [ "# Test JSON output to show the new consistent naming convention\n", "!bioscript classify classify_herc2.py --file herc2_test_ga.tsv --out json --participant_id=\"test_user\"" @@ -299,10 +426,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "5c543421", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultProject(name='herc2-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of HERC2 genotypes for eye color prediction.', template=, version='0.1.1', assets=['classify_herc2.py', 'aggregate_population_stats.py'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='HERC2 eye color classification (aggregated)', format='tsv', path='result_HERC2.tsv', cli_flag=None), Output(name='population_stats', type='File', description='HERC2 population allele statistics (aggregated)', format='tsv', path='result_HERC2_stats.tsv', cli_flag=None)], processes=[ProcessDefinition(name='herc2_classifier', script='classify_herc2.py', container='ghcr.io/openmined/bioscript:0.1.6', kind='bioscript')], docker_image='ghcr.io/openmined/bioscript:0.1.6', docker_platform='linux/amd64')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pathlib import Path\n", "from bioscript import export_bioscript_workflow\n", @@ -313,7 +451,7 @@ " workflow_name='herc2-classifier',\n", " author='madhava@openmined.org',\n", " target_dir=\"./\",\n", - " assets={},\n", + " assets=['herc2-classifier/assets/aggregate_population_stats.py'],\n", " inputs=[\n", " {\n", " 'name': 'participants',\n", @@ -334,19 +472,96 @@ " 'format': 'tsv',\n", " 'path': 'result_HERC2.tsv',\n", " },\n", + " {\n", + " 'name': 'population_stats',\n", + " 'type': 'File',\n", + " 'description': 'HERC2 population allele statistics (aggregated)',\n", + " 'format': 'tsv',\n", + " 'path': 'result_HERC2_stats.tsv',\n", + " },\n", " ],\n", " version=\"0.1.1\",\n", " description=\"Classification of HERC2 genotypes for eye color prediction.\",\n", ")\n", - "project" + "project\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, + "id": "730842a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2819" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "workflow_path = Path('herc2-classifier/workflow.nf')\n", + "if not workflow_path.exists():\n", + " raise FileNotFoundError('workflow.nf not found. Run the export cell first.')\n", + "\n", + "text = workflow_path.read_text(encoding='utf-8')\n", + "\n", + "if 'aggregate_population_stats' not in text:\n", + " text = text.replace(\n", + " \" def aggregated = aggregate_results(\\n per_participant_results.collect()\\n )\\n\\n emit:\\n classification_result = aggregated\\n}\\n\",\n", + " \" def aggregated = aggregate_results(\\n per_participant_results.collect()\\n )\\n\\n // Aggregate population statistics\\n def population_stats_ch = aggregate_population_stats(\\n Channel.value(assetsDirPath),\\n aggregated\\n )\\n\\n emit:\\n classification_result = aggregated\\n population_stats = population_stats_ch\\n}\\n\",\n", + " )\n", + "\n", + "process_block = '''\n", + "process aggregate_population_stats {\n", + " container 'ghcr.io/openmined/bioscript:0.1.6'\n", + " publishDir params.results_dir, mode: 'copy', overwrite: true\n", + "\n", + " input:\n", + " path assets_dir\n", + " path aggregated_results\n", + "\n", + " output:\n", + " path \\\"result_HERC2_stats.tsv\\\"\n", + "\n", + " script:\n", + " \"\"\"\n", + " python3 \"${assets_dir}/aggregate_population_stats.py\" \\\n", + " --input \"${aggregated_results}\" \\\n", + " --output result_HERC2_stats.tsv\n", + " \"\"\"\n", + "}\n", + "'''\n", + "\n", + "if 'process aggregate_population_stats' not in text:\n", + " text += process_block\n", + "\n", + "workflow_path.write_text(text, encoding='utf-8')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "3d78cdd5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultPipeline(name='herc2-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='herc2', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_HERC2.tsv)', 'population_stats': 'File(result_HERC2_stats.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='herc2_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore\n", "\n", @@ -365,6 +580,7 @@ " },\n", " publish={\n", " 'classification_result': 'File(result_HERC2.tsv)',\n", + " 'population_stats': 'File(result_HERC2_stats.tsv)',\n", " },\n", " store={\n", " 'counts_sql': SQLStore(\n", @@ -398,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/python/src/bioscript/biovault.py b/python/src/bioscript/biovault.py index 662ff46..58a6b1f 100644 --- a/python/src/bioscript/biovault.py +++ b/python/src/bioscript/biovault.py @@ -760,10 +760,11 @@ def _generate_participant_workflow_nf(self, entrypoint: Optional[str] = None) -> if "{participant_id}" in output_spec.path: individual_pattern = output_spec.path.replace("{participant_id}", "*") else: - aggregated_path = output_spec.path - # Extract classifier name from aggregated path (e.g., result_HERC2.tsv -> HERC2) - if aggregated_path.startswith("result_") and aggregated_path.endswith(".tsv"): - classifier_name = aggregated_path[7:-4] # Remove "result_" and ".tsv" + if aggregated_path is None: + aggregated_path = output_spec.path + # Extract classifier name from aggregated path (e.g., result_HERC2.tsv -> HERC2) + if aggregated_path.startswith("result_") and aggregated_path.endswith(".tsv"): + classifier_name = aggregated_path[7:-4] # Remove "result_" and ".tsv" if not classifier_name: classifier_name = self.name.upper().replace("-", "_").replace(" ", "_") @@ -1077,10 +1078,14 @@ def export( raise FileNotFoundError(f"Asset source not found: {src_path}") dst_path = assets_dir / asset dst_path.parent.mkdir(parents=True, exist_ok=True) - if src_path.is_dir(): - shutil.copytree(src_path, dst_path, dirs_exist_ok=True) - else: - shutil.copy2(src_path, dst_path) + try: + if src_path.is_dir(): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + else: + shutil.copy2(src_path, dst_path) + except shutil.SameFileError: + # Asset already in place (common when exporting into project dir) + pass copied_assets.add(asset) # Export notebook if provided