Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion examples/herc2/classify_herc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@
# (A;G) also tends toward brown.
# (G;G) gives blue eye color ~99% of the time.

def _format_allele_label(allele):
if isinstance(allele, Alleles):
return ",".join(sorted(a.value for a in allele))
if isinstance(allele, str):
return allele
if hasattr(allele, "__iter__"):
return ",".join(sorted(str(a) for a in allele))
return str(allele)

class HERC2Classifier(GenotypeClassifier):
def classify(self, matches):
match = matches.get(rs12913832)
Expand All @@ -25,10 +34,14 @@ def classify(self, matches):
result = "No call"
genotype_sorted = None
match_type = MatchType.NO_CALL.value
ref_count = 0
alt_count = 0
else:
genotype_sorted = match.genotype_sorted
result = eye_color_map.get(genotype_sorted, "Unknown")
match_type = match.match_type.value
ref_count = match.ref_count
alt_count = match.alt_count

# Extract properties from match (source_row has the actual data from the file)
if match and match.source_row:
Expand All @@ -41,15 +54,21 @@ def classify(self, matches):
position = None

# Create report row using match properties
ref_label = _format_allele_label(rs12913832.ref)
alt_label = _format_allele_label(rs12913832.alt)
report_row = {
"participant_id": self.participant_id,
"filename": self.filename,
"rsid": rsid,
"chromosome": chromosome,
"position": position,
"ref": ref_label,
"alt": alt_label,
"genotype": genotype_sorted,
"match_type": match_type,
"eye_color": result,
"ref_count": ref_count,
"alt_count": alt_count,
}

# Write to TSV file (as a list with one row)
Expand Down Expand Up @@ -93,6 +112,8 @@ def test_brown_homozygous():
assert result[0]["rsid"] == "rs12913832"
assert result[0]["chromosome"] == "15"
assert result[0]["position"] == 28120472
assert result[0]["ref_count"] == 2
assert result[0]["alt_count"] == 0
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -104,6 +125,8 @@ def test_brown_heterozygous_unsorted():
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 1
assert result[0]["alt_count"] == 1
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -115,6 +138,8 @@ def test_blue_homozygous():
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 0
assert result[0]["alt_count"] == 2
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -126,17 +151,21 @@ def test_no_call():
assert result[0]["match_type"] == MatchType.NO_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 0
assert result[0]["alt_count"] == 0
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

def test_unexpected_allele_c():
"""Test handling of unexpected C allele (not in reference map)."""
'Test handling of unexpected C allele (not in reference map).'
result = classify_fixture("AC")
assert len(result) == 1
assert result[0]["eye_color"] == "Unknown"
assert result[0]["genotype"] == "AC"
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 1
assert result[0]["alt_count"] == 1
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")
144 changes: 144 additions & 0 deletions examples/herc2/herc2-classifier/assets/aggregate_population_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import argparse
import csv
import re
from collections import defaultdict

def parse_args():
parser = argparse.ArgumentParser(description="Aggregate population allele statistics")
parser.add_argument("--input", required=True, help="Input TSV from combined classifier output")
parser.add_argument("--output", required=True, help="Output TSV path")
return parser.parse_args()

def locus_key_for_row(row):
chrom = (row.get("chromosome") or "").strip()
pos = (row.get("position") or "").strip()
ref = (row.get("ref") or "").strip()
alt = (row.get("alt") or "").strip()
if not (chrom and pos and ref and alt):
return None
return f"{chrom}-{pos}-{ref}-{alt}"

def to_int(value):
try:
return int(value)
except Exception:
return 0

def parse_alleles(genotype, alleles):
if not genotype:
return []
genotype = genotype.strip()
if not genotype:
return []

# Split on common genotype separators
if "/" in genotype or "|" in genotype:
parts = [p for p in re.split(r"[\/|]", genotype) if p]
return parts

# If alleles are same length and genotype is a concat of two alleles, split evenly
if alleles:
lengths = {len(a) for a in alleles if a}
if len(lengths) == 1:
allele_len = lengths.pop()
if allele_len > 0 and len(genotype) == 2 * allele_len:
return [genotype[:allele_len], genotype[allele_len:]]

# Fallback for single-base alleles like "AA" or "AG"
if len(genotype) == 2:
return [genotype[0], genotype[1]]

return [genotype]

def main():
args = parse_args()

# stats keyed by (locus_base, allele)
stats = defaultdict(lambda: {
"allele_count": 0,
"num_homo": 0,
"num_hetero": 0,
"rsid": None,
})
locus_called_samples = defaultdict(int)

with open(args.input, "r", encoding="utf-8") as fh:
reader = csv.DictReader(fh, delimiter="\t")
for row in reader:
match_type = (row.get("match_type") or "").strip()
if match_type.lower() == "no call":
continue

locus_key = locus_key_for_row(row)
if locus_key is None:
continue

alt_count = to_int(row.get("alt_count"))
ref_count = to_int(row.get("ref_count"))
allele_number = ref_count + alt_count

if allele_number <= 0:
continue

chrom = (row.get("chromosome") or "").strip()
pos = (row.get("position") or "").strip()
ref = (row.get("ref") or "").strip()
alt = (row.get("alt") or "").strip()
locus_base = f"{chrom}-{pos}"

alleles = []
if ref:
alleles.append(ref)
if alt:
alleles.extend([a.strip() for a in alt.split(",") if a.strip()])

genotype = (row.get("genotype") or "").strip()
gt_alleles = parse_alleles(genotype, alleles)
if not gt_alleles:
continue

locus_called_samples[locus_base] += 1

rsid = row.get("rsid")
is_homo = len(gt_alleles) == 2 and gt_alleles[0] == gt_alleles[1]

for allele in gt_alleles:
key = (locus_base, allele)
entry = stats[key]
entry["allele_count"] += 1
if is_homo and allele == gt_alleles[0]:
entry["num_homo"] += 1
elif not is_homo:
entry["num_hetero"] += 1
if rsid and not entry["rsid"]:
entry["rsid"] = rsid

with open(args.output, "w", encoding="utf-8", newline="") as out_fh:
fieldnames = [
"locus_key",
"allele_count",
"allele_number",
"num_homo",
"num_hetero",
"allele_freq",
"rsid",
]
writer = csv.DictWriter(out_fh, fieldnames=fieldnames, delimiter="\t")
writer.writeheader()
for (locus_base, allele) in sorted(stats.keys()):
entry = stats[(locus_base, allele)]
called = locus_called_samples.get(locus_base, 0)
allele_number = called * 2
allele_freq = (entry["allele_count"] / allele_number) if allele_number else 0
writer.writerow({
"locus_key": f"{locus_base}-{allele}",
"allele_count": entry["allele_count"],
"allele_number": allele_number,
"num_homo": entry["num_homo"],
"num_hetero": entry["num_hetero"],
"allele_freq": f"{allele_freq:.6f}",
"rsid": entry["rsid"],
})

if __name__ == "__main__":
main()
31 changes: 30 additions & 1 deletion examples/herc2/herc2-classifier/assets/classify_herc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@
# (A;G) also tends toward brown.
# (G;G) gives blue eye color ~99% of the time.

def _format_allele_label(allele):
if isinstance(allele, Alleles):
return ",".join(sorted(a.value for a in allele))
if isinstance(allele, str):
return allele
if hasattr(allele, "__iter__"):
return ",".join(sorted(str(a) for a in allele))
return str(allele)

class HERC2Classifier(GenotypeClassifier):
def classify(self, matches):
match = matches.get(rs12913832)
Expand All @@ -25,10 +34,14 @@ def classify(self, matches):
result = "No call"
genotype_sorted = None
match_type = MatchType.NO_CALL.value
ref_count = 0
alt_count = 0
else:
genotype_sorted = match.genotype_sorted
result = eye_color_map.get(genotype_sorted, "Unknown")
match_type = match.match_type.value
ref_count = match.ref_count
alt_count = match.alt_count

# Extract properties from match (source_row has the actual data from the file)
if match and match.source_row:
Expand All @@ -41,15 +54,21 @@ def classify(self, matches):
position = None

# Create report row using match properties
ref_label = _format_allele_label(rs12913832.ref)
alt_label = _format_allele_label(rs12913832.alt)
report_row = {
"participant_id": self.participant_id,
"filename": self.filename,
"rsid": rsid,
"chromosome": chromosome,
"position": position,
"ref": ref_label,
"alt": alt_label,
"genotype": genotype_sorted,
"match_type": match_type,
"eye_color": result,
"ref_count": ref_count,
"alt_count": alt_count,
}

# Write to TSV file (as a list with one row)
Expand Down Expand Up @@ -93,6 +112,8 @@ def test_brown_homozygous():
assert result[0]["rsid"] == "rs12913832"
assert result[0]["chromosome"] == "15"
assert result[0]["position"] == 28120472
assert result[0]["ref_count"] == 2
assert result[0]["alt_count"] == 0
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -104,6 +125,8 @@ def test_brown_heterozygous_unsorted():
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 1
assert result[0]["alt_count"] == 1
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -115,6 +138,8 @@ def test_blue_homozygous():
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 0
assert result[0]["alt_count"] == 2
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

Expand All @@ -126,17 +151,21 @@ def test_no_call():
assert result[0]["match_type"] == MatchType.NO_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 0
assert result[0]["alt_count"] == 0
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")

def test_unexpected_allele_c():
"""Test handling of unexpected C allele (not in reference map)."""
'Test handling of unexpected C allele (not in reference map).'
result = classify_fixture("AC")
assert len(result) == 1
assert result[0]["eye_color"] == "Unknown"
assert result[0]["genotype"] == "AC"
assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
assert result[0]["participant_id"] == "TEST_ID"
assert result[0]["filename"] == "test.txt"
assert result[0]["ref_count"] == 1
assert result[0]["alt_count"] == 1
# Cleanup
os.remove("result_HERC2_TEST_ID.tsv")
33 changes: 17 additions & 16 deletions examples/herc2/herc2-classifier/flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@ metadata:
name: herc2-classifier
version: 0.1.1
spec:
steps:
- id: herc2
uses:
source:
kind: local
path: ./
with:
participants: inputs.samplesheet
publish:
classification_result: File(result_HERC2.tsv)
population_stats: File(result_HERC2_stats.tsv)
store:
counts_sql:
kind: sql
source: classification_result
table: herc2_{run_id}
key_column: participant_id
inputs:
samplesheet:
type: List[GenotypeRecord]
steps:
- id: herc2
uses:
source:
kind: local
path: ./
with:
participants: inputs.samplesheet
publish:
classification_result: File(result_HERC2.tsv)
store:
counts_sql:
kind: sql
source: classification_result
table: herc2_{run_id}
key_column: participant_id
Loading
Loading