Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import argparse
import csv
from collections import Counter

# Canonical APOL1 diploid classification groups, risk alleles listed first
# within each pair; also fixes the row order of the output TSV.
GROUPS = ["G0/G0", "G1/G0", "G1/G1", "G2/G0", "G2/G1", "G2/G2"]


def parse_args(argv=None):
    """Parse command-line arguments for the group-stats aggregator.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` (the normal
            CLI path). Accepting an explicit list keeps the existing
            behavior while making the parser unit-testable.

    Returns:
        argparse.Namespace with ``input`` and ``output`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Count distinct APOL1 diploid classification groups"
    )
    parser.add_argument("--input", required=True, help="Input TSV from combined classifier output")
    parser.add_argument("--output", required=True, help="Output TSV path")
    return parser.parse_args(argv)


def normalize_status(apol1_status):
    """Return the canonical form of a diploid APOL1 status string.

    The canonical form sorts the two alleles G2 < G1 < G0 (risk alleles
    first), so e.g. "G0/G1" becomes "G1/G0". Whitespace around the whole
    string and around each allele is ignored.

    Returns:
        The normalized "X/Y" string, or None when the input is empty/None
        or does not contain exactly two '/'-separated fields.
    """
    raw = (apol1_status or "").strip()
    if not raw:
        return None
    alleles = [field.strip() for field in raw.split("/")]
    if len(alleles) != 2:
        return None
    # Unknown alleles rank last (99) so recognized ones always sort first.
    rank = {"G2": 0, "G1": 1, "G0": 2}
    first, second = sorted(alleles, key=lambda allele: rank.get(allele, 99))
    return "/".join((first, second))


def main():
    """CLI entry point: tally canonical APOL1 groups per participant.

    Reads the combined classifier TSV (columns ``participant_id`` and
    ``apol1_status``), keeps only the first row with a non-empty status
    for each participant, normalizes that status, and writes one
    ``classification``/``count`` row per group in GROUPS to the output
    TSV (zero counts included).
    """
    args = parse_args()

    # First non-empty status wins per participant; value may be None when
    # normalization fails, which is filtered out before counting.
    first_status_by_pid = {}
    with open(args.input, "r", encoding="utf-8") as in_fh:
        for record in csv.DictReader(in_fh, delimiter="\t"):
            participant = (record.get("participant_id") or "").strip()
            raw_status = (record.get("apol1_status") or "").strip()
            if participant and raw_status and participant not in first_status_by_pid:
                first_status_by_pid[participant] = normalize_status(raw_status)

    group_counts = Counter(
        status for status in first_status_by_pid.values() if status
    )

    with open(args.output, "w", encoding="utf-8", newline="") as out_fh:
        writer = csv.DictWriter(out_fh, fieldnames=["classification", "count"], delimiter="\t")
        writer.writeheader()
        writer.writerows(
            {"classification": group, "count": group_counts.get(group, 0)}
            for group in GROUPS
        )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions examples/apol1/apol1-classifier/flow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ spec:
population_stats: File(result_APOL1_stats.tsv)
classification_stats: File(result_APOL1_classification_stats.tsv)
apol1_status: File(result_APOL1_status.tsv)
group_stats: File(result_APOL1_group_stats.tsv)
store:
counts_sql:
kind: sql
Expand Down
9 changes: 8 additions & 1 deletion examples/apol1/apol1-classifier/module.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
kind: nextflow
entrypoint: workflow.nf
template: dynamic-nextflow
image: ghcr.io/openmined/bioscript:0.1.6
image: ghcr.io/openmined/bioscript:0.1.7
inputs:
- name: participants
type: List[GenotypeRecord]
Expand Down Expand Up @@ -46,9 +46,16 @@ spec:
format:
kind: tsv
path: result_APOL1_status.tsv
- name: group_stats
type: File
description: Absolute counts per distinct APOL1 diploid classification group
format:
kind: tsv
path: result_APOL1_group_stats.tsv
parameters: []
assets:
- path: classify_apol1.py
- path: aggregate_population_stats.py
- path: aggregate_classification_stats.py
- path: aggregate_apol1_status.py
- path: aggregate_apol1_group_stats.py
26 changes: 26 additions & 0 deletions examples/apol1/apol1-classifier/workflow.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,18 @@ workflow USER {
aggregated
)

// Count distinct diploid classification groups (G0/G0, G1/G0, etc.)
def group_stats_ch = aggregate_apol1_group_stats(
Channel.value(assetsDirPath),
aggregated
)

emit:
classification_result = aggregated
population_stats = population_stats_ch
classification_stats = classification_stats_ch
apol1_status = apol1_status_ch
group_stats = group_stats_ch
}

process apol1_classifier {
Expand Down Expand Up @@ -152,3 +159,22 @@ process aggregate_apol1_status {
--output result_APOL1_status.tsv
"""
}

// Aggregate per-participant APOL1 classifications into absolute counts per
// distinct diploid group (G0/G0, G1/G0, ...), delegating to the Python
// helper script staged in the assets directory.
process aggregate_apol1_group_stats {
    container 'ghcr.io/openmined/bioscript:0.1.7'
    // Copy the result TSV into the pipeline results directory on completion.
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
    path assets_dir          // staged assets dir containing aggregate_apol1_group_stats.py
    path aggregated_results  // combined classifier output TSV (one row per participant record)

    output:
    path "result_APOL1_group_stats.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_apol1_group_stats.py" \
    --input "${aggregated_results}" \
    --output result_APOL1_group_stats.tsv
    """
}
Loading
Loading