Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import csv
from collections import Counter

# Canonical APOL1 diploid classification groups, risk alleles listed first
# within each pair; also fixes the row order of the output TSV.
GROUPS = ["G0/G0", "G1/G0", "G1/G1", "G2/G0", "G2/G1", "G2/G2"]


def parse_args(argv=None):
    """Parse command-line arguments for the group-stats aggregator.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` (the normal
            CLI path). Accepting an explicit list keeps the existing
            behavior while making the parser unit-testable.

    Returns:
        argparse.Namespace with ``input`` and ``output`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Count distinct APOL1 diploid classification groups"
    )
    parser.add_argument("--input", required=True, help="Input TSV from combined classifier output")
    parser.add_argument("--output", required=True, help="Output TSV path")
    return parser.parse_args(argv)


def normalize_status(apol1_status):
    """Return the canonical form of a diploid APOL1 status string.

    The canonical form sorts the two alleles G2 < G1 < G0 (risk alleles
    first), so e.g. "G0/G1" becomes "G1/G0". Whitespace around the whole
    string and around each allele is ignored.

    Returns:
        The normalized "X/Y" string, or None when the input is empty/None
        or does not contain exactly two '/'-separated fields.
    """
    raw = (apol1_status or "").strip()
    if not raw:
        return None
    alleles = [field.strip() for field in raw.split("/")]
    if len(alleles) != 2:
        return None
    # Unknown alleles rank last (99) so recognized ones always sort first.
    rank = {"G2": 0, "G1": 1, "G0": 2}
    first, second = sorted(alleles, key=lambda allele: rank.get(allele, 99))
    return "/".join((first, second))


def main():
    """CLI entry point: tally canonical APOL1 groups per participant.

    Reads the combined classifier TSV (columns ``participant_id`` and
    ``apol1_status``), keeps only the first row with a non-empty status
    for each participant, normalizes that status, and writes one
    ``classification``/``count`` row per group in GROUPS to the output
    TSV (zero counts included).
    """
    args = parse_args()

    # First non-empty status wins per participant; value may be None when
    # normalization fails, which is filtered out before counting.
    first_status_by_pid = {}
    with open(args.input, "r", encoding="utf-8") as in_fh:
        for record in csv.DictReader(in_fh, delimiter="\t"):
            participant = (record.get("participant_id") or "").strip()
            raw_status = (record.get("apol1_status") or "").strip()
            if participant and raw_status and participant not in first_status_by_pid:
                first_status_by_pid[participant] = normalize_status(raw_status)

    group_counts = Counter(
        status for status in first_status_by_pid.values() if status
    )

    with open(args.output, "w", encoding="utf-8", newline="") as out_fh:
        writer = csv.DictWriter(out_fh, fieldnames=["classification", "count"], delimiter="\t")
        writer.writeheader()
        writer.writerows(
            {"classification": group, "count": group_counts.get(group, 0)}
            for group in GROUPS
        )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions examples/apol1/apol1-classifier/flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ spec:
population_stats: File(result_APOL1_stats.tsv)
classification_stats: File(result_APOL1_classification_stats.tsv)
apol1_status: File(result_APOL1_status.tsv)
group_stats: File(result_APOL1_group_stats.tsv)
store:
counts_sql:
kind: sql
Expand Down
9 changes: 8 additions & 1 deletion examples/apol1/apol1-classifier/module.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
kind: nextflow
entrypoint: workflow.nf
template: dynamic-nextflow
image: ghcr.io/openmined/bioscript:0.1.6
image: ghcr.io/openmined/bioscript:0.1.7
inputs:
- name: participants
type: List[GenotypeRecord]
Expand Down Expand Up @@ -46,9 +46,16 @@ spec:
format:
kind: tsv
path: result_APOL1_status.tsv
- name: group_stats
type: File
description: Absolute counts per distinct APOL1 diploid classification group
format:
kind: tsv
path: result_APOL1_group_stats.tsv
parameters: []
assets:
- path: classify_apol1.py
- path: aggregate_population_stats.py
- path: aggregate_classification_stats.py
- path: aggregate_apol1_status.py
- path: aggregate_apol1_group_stats.py
26 changes: 26 additions & 0 deletions examples/apol1/apol1-classifier/workflow.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,18 @@ workflow USER {
aggregated
)

// Count distinct diploid classification groups (G0/G0, G1/G0, etc.)
def group_stats_ch = aggregate_apol1_group_stats(
Channel.value(assetsDirPath),
aggregated
)

emit:
classification_result = aggregated
population_stats = population_stats_ch
classification_stats = classification_stats_ch
apol1_status = apol1_status_ch
group_stats = group_stats_ch
}

process apol1_classifier {
Expand Down Expand Up @@ -152,3 +159,22 @@ process aggregate_apol1_status {
--output result_APOL1_status.tsv
"""
}

// Aggregate per-participant APOL1 classifications into absolute counts per
// distinct diploid group (G0/G0, G1/G0, ...), delegating to the Python
// helper script staged in the assets directory.
process aggregate_apol1_group_stats {
    container 'ghcr.io/openmined/bioscript:0.1.7'
    // Copy the result TSV into the pipeline results directory on completion.
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
    path assets_dir          // staged assets dir containing aggregate_apol1_group_stats.py
    path aggregated_results  // combined classifier output TSV (one row per participant record)

    output:
    path "result_APOL1_group_stats.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_apol1_group_stats.py" \
    --input "${aggregated_results}" \
    --output result_APOL1_group_stats.tsv
    """
}
Loading
Loading