diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7ec8860..1e38360 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,8 @@ jobs: strategy: fail-fast: false runs-on: ubuntu-latest + env: + SQL_DRIVER: sqlite steps: - uses: actions/checkout@v4 diff --git a/examples/empty/.tests/unit/common.py b/examples/empty/.tests/unit/common.py index 519ef24..caf1370 100644 --- a/examples/empty/.tests/unit/common.py +++ b/examples/empty/.tests/unit/common.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from subprocess import check_output +from subprocess import check_output, PIPE cmp_cmds = { @@ -28,21 +28,22 @@ def check(self, cmp_cmds=cmp_cmds): for path, subdirs, files in os.walk(self.data_path) for f in files ) - print(f"input: {input_files}") # DEBUG - # Workdir files + print(f"input_files: {input_files}") # DEBUG + # Workdir files (ignoring '.snakemake/' and 'config/' folders) workdir_files = set( (Path(path) / f).relative_to(self.workdir) for path, subdirs, files in os.walk(self.workdir) for f in files + if "/.snakemake" not in path and "/config" not in path ) - print(f"workdir: {workdir_files}") # DEBUG + print(f"workdir_files: {workdir_files}") # DEBUG # Expected files expected_files = set( (Path(path) / f).relative_to(self.expected_path) for path, subdirs, files in os.walk(self.expected_path) for f in files ) - print(f"expected: {expected_files}") # DEBUG + print(f"expected_files: {expected_files}") # DEBUG assert expected_files.issubset( workdir_files @@ -55,5 +56,6 @@ def check(self, cmp_cmds=cmp_cmds): def compare_files(self, expected_file, generated_file, cmp_cmds): check_output( cmp_cmds.get(expected_file.suffix, ["cmp"]) - + [expected_file, generated_file] + + [expected_file, generated_file], + stderr=PIPE, ) diff --git a/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/config.yaml b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/config.yaml new file mode 100644 index 
0000000..765a235 --- /dev/null +++ b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/config.yaml @@ -0,0 +1,155 @@ + +# - Only tested with Phred33 quality scores + +samples: config/samples.tsv + +units: config/units.tsv + + + +############# +### READS ### +############# +trim: + trim: + activate: true + tool: adapterremoval + params: "--trimns --maxns 10 --trimqualities --minlength 30 --mask-degenerate-bases --seed 12345" + + # Ignored for SE + collapse: + activate: true + params: "--collapse-conservatively" + +derep: + extension: + activate: false + k: 16 + params: "ibb=t prefilter=0 el=100 er=100 ecc=f ecco=f ignorebadquality extendrollback=0" + + derep: + activate: true + # vsearch or seqkit + tool: seqkit + params: "" + + low_complex: + params: "entropy=0.7 entropywindow=30 entropyk=4" + + + +############# +### ALIGN ### +############# +prefilter: + taxa: "Bacteria,Archaea,Viruses" + + ref: + prok: + n_shards: 2 + path: "data/prok.{n_shard}-of-2.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal -N 1" + bt2l: False + acc2taxid: "data/prok.acc2taxid.gz" + virus: + n_shards: 1 + path: "data/virus.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/virus.acc2taxid.gz" + + filter: + saturated_reads: + activate: true + n_alns: 10 + + bam_filter: + reassign: + activate: false + params: "--iters 0 --min-read-ani 92 --min-read-count 3 --scale 0 --reference-lengths genomes.len.map" + + filter: + activate: false + params: "--min-read-ani 92 --min-read-count 3 --min-normalized-entropy 0.6 --min-normalized-gini 0.4 --min-avg-read-ani 94 --reference-lengths genomes.len.map" + + lca: + activate: false + params: "--lca-rank genus --reference-lengths genomes.len.map" + + taxonomy: + nodes: "data/taxdump/nodes.dmp" + names: "data/taxdump/names.dmp" + + metadmg: + damage: + 
params: "--print_length 15" + + lca: + params: "--fix_ncbi 0 --how_many 25 --weight_type 1 --edit_dist_max 10000 --lca_rank genus" + + dfit: + params: "--nopt 5 --showfits 2" + + +euk: + ref: + mitoch: + n_shards: 1 + path: "data/mitoch.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/mitoch.acc2taxid.gz" + plastid: + n_shards: 1 + path: "data/plastid.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/plastid.acc2taxid.gz" + + filter: + saturated_reads: + activate: true + n_alns: 10 + + bam_filter: + reassign: + activate: false + params: "--iters 0 --min-read-ani 92 --min-read-count 3 --scale 0" + + filter: + activate: false + params: "--min-read-ani 92 --min-read-count 3 --min-normalized-entropy 0.6 --min-normalized-gini 0.4 --min-avg-read-ani 92" + + lca: + activate: false + params: "--lca-rank genus" + + taxonomy: + nodes: "data/taxdump/nodes.dmp" + names: "data/taxdump/names.dmp" + + metadmg: + damage: + params: "--print_length 15" + + lca: + params: "--fix_ncbi 0 --how_many 15 --sim_score_low 0.95 --weight_type 0 --lca_rank genus" + + dfit: + params: "--nopt 5 --showfits 2 --seed 12345" + + +############ +## REPORT ## +############ +report: + multiqc: "--verbose --cl-config 'custom_logo: data/KU_long.png' --cl-config 'custom_logo_title: CAEG - Center for Ancient Environmental Genomics' --cl-config 'custom_logo_url: https://globe.ku.dk/research/caeg/'" + multiqc_db_url: "sqlite:///test_qc.sqlite" diff --git a/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv new file mode 100644 index 0000000..d31b036 --- /dev/null +++ b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv @@ -0,0 +1,2 @@ +sample alias group 
condition +HD827sonic_1 NA NA NA diff --git a/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/units.tsv b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/units.tsv new file mode 100644 index 0000000..a2cba28 --- /dev/null +++ b/examples/empty/.tests/unit/multiqc_taxon_upload/config/config/units.tsv @@ -0,0 +1,3 @@ +# Adapter list: https://gist.github.com/photocyte/3edd9401d0b13476e60f8b104c2575f8 +sample library barcode flowcell lane seq_type library_type material data machine run_n center platform adapters sample_n date +HD827sonic_1 lib1 ACGGAACAxACGAGAAC HKTG2BGXG L001 PE ds DNA data/empty_L001_R{Read}.fq.gz NDX550220 98 HYDRA_GEN ILLUMINA AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT S1 21-09-2021 diff --git a/examples/empty/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip b/examples/empty/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip new file mode 100644 index 0000000..f3515ed Binary files /dev/null and b/examples/empty/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip differ diff --git a/examples/empty/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag b/examples/empty/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag new file mode 100644 index 0000000..dd5e3f2 --- /dev/null +++ b/examples/empty/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag @@ -0,0 +1,77 @@ +INFO:DB sqlite:///test_qc.sqlite does not exist. +INFO:Reading file reports/multiqc_data.taxon.zip +INFO:Uploading report v1.33 to DB... 
+INFO:Adding report record to DB +INFO:Adding report metadata to DB +INFO:Parsing section multiqc_fastqc_prefilter-fastqc +INFO:Parsing section multiqc_fastqc_low_complexity-fastqc +INFO:Parsing section multiqc_fastqc_derep-fastqc +INFO:Parsing section multiqc_nonpareil_merge_lanes-nonpareil +INFO:Parsing section multiqc_fastqc_merge_lanes-fastqc +INFO:Parsing section multiqc_fastqc_trim-fastqc +INFO:Parsing section multiqc_adapter_removal +INFO:Parsing section multiqc_fastqc_raw-fastqc +INFO:Parsing section multiqc_general_stats +INFO:Parsing plot prefilter-fastqc_sequence_counts_plot +INFO:Parsing dataset prefilter-fastqc_sequence_counts_plot +INFO:Parsing plot prefilter-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset prefilter-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset prefilter-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot prefilter-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset prefilter-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot prefilter-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot low_complexity-fastqc_sequence_counts_plot +INFO:Parsing dataset low_complexity-fastqc_sequence_counts_plot +INFO:Parsing plot low_complexity-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset low_complexity-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset low_complexity-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot low_complexity-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset low_complexity-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot low_complexity-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot derep-fastqc_sequence_counts_plot +INFO:Parsing dataset derep-fastqc_sequence_counts_plot +INFO:Parsing plot derep-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset derep-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing 
dataset derep-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot derep-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset derep-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot derep-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot nonpareil-table +WARNING:Plot type violin plot is not supported +INFO:Parsing plot nonpareil-redundancy-plot +INFO:Parsing dataset nonpareil-redundancy-plot_Combined +INFO:Parsing dataset nonpareil-redundancy-plot_Observed +INFO:Parsing plot merge_lanes-fastqc_sequence_counts_plot +INFO:Parsing dataset merge_lanes-fastqc_sequence_counts_plot +INFO:Parsing plot merge_lanes-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset merge_lanes-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset merge_lanes-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot merge_lanes-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset merge_lanes-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot merge_lanes-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot trim-fastqc_sequence_counts_plot +INFO:Parsing dataset trim-fastqc_sequence_counts_plot +INFO:Parsing plot trim-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset trim-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset trim-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot trim-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset trim-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot trim-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot ar_retained_plot +INFO:Parsing dataset ar_retained_plot +INFO:Parsing plot raw-fastqc_sequence_counts_plot +INFO:Parsing dataset raw-fastqc_sequence_counts_plot +INFO:Parsing plot raw-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset raw-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset 
raw-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot raw-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset raw-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot raw-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot general_stats_table +WARNING:Plot type violin plot is not supported diff --git a/examples/empty/.tests/unit/test_multiqc_taxon_upload.py b/examples/empty/.tests/unit/test_multiqc_taxon_upload.py new file mode 100644 index 0000000..4d2a3e0 --- /dev/null +++ b/examples/empty/.tests/unit/test_multiqc_taxon_upload.py @@ -0,0 +1,69 @@ +""" +Rule test code for unit testing of rules generated with Snakemake 9.16.4.dev3. +""" + +import os +import sys +import shutil +import tempfile +from pathlib import Path +from subprocess import check_output + +sys.path.insert(0, os.path.dirname(__file__)) + + +def test_multiqc_taxon_upload(conda_prefix): + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "workdir" + config_path = Path(".tests/unit/multiqc_taxon_upload/config") + data_path = Path(".tests/unit/multiqc_taxon_upload/data") + expected_path = Path(".tests/unit/multiqc_taxon_upload/expected") + + # Copy config to the temporary workdir. + shutil.copytree(config_path, workdir) + + # Copy data to the temporary workdir. + shutil.copytree(data_path, workdir, dirs_exist_ok=True) + + # Run the test job. + check_output( + [ + "python", + "-m", + "snakemake", + "stats/reports/multiqc_taxon.upload.flag", + "--snakefile", + "../../workflow/Snakefile", + "-f", + "--notemp", + "--show-failed-logs", + "-j1", + "--target-files-omit-workdir-adjustment", + "--allowed-rules", + "multiqc_taxon_upload", + "--configfile", + "config/config.yaml", + "--software-deployment-method", + "conda", + "--directory", + workdir, + ] + + conda_prefix + ) + + # Check the output byte by byte using cmp/zmp/bzcmp/xzcmp. 
+ # To modify this behavior, you can inherit from common.OutputChecker in here + # and overwrite the method `compare_files(expected_file, generated_file, cmp_cmds)`, + # also see common.py. + import common + + common.OutputChecker(data_path, expected_path, workdir).check( + { + ".flag": [ + "diff", + "--ignore-matching-lines=Uploading", + "--ignore-matching-lines=tzname", + ] + } + ) diff --git a/examples/empty/.tests/unit/test_taxon_align_stats.py b/examples/empty/.tests/unit/test_taxon_align_stats.py index 83b3066..0822912 100644 --- a/examples/empty/.tests/unit/test_taxon_align_stats.py +++ b/examples/empty/.tests/unit/test_taxon_align_stats.py @@ -59,5 +59,11 @@ def test_taxon_align_stats(conda_prefix): import common common.OutputChecker(data_path, expected_path, workdir).check( - {".txt": ["diff", "--ignore-matching-lines=\\#"]} + { + ".txt": [ + "diff", + "--ignore-matching-lines=samtools", + "--ignore-matching-lines=command", + ] + } ) diff --git a/examples/empty/config/config.yaml b/examples/empty/config/config.yaml index 7dc6288..95a8564 100644 --- a/examples/empty/config/config.yaml +++ b/examples/empty/config/config.yaml @@ -152,3 +152,4 @@ euk: ############ report: multiqc: "--verbose --cl-config 'custom_logo: data/KU_long.png' --cl-config 'custom_logo_title: CAEG - Center for Ancient Environmental Genomics' --cl-config 'custom_logo_url: https://globe.ku.dk/research/caeg/'" + multiqc_db_url: "test_qc.sqlite" diff --git a/examples/euk_prok_virus/.tests/unit/common.py b/examples/euk_prok_virus/.tests/unit/common.py index 519ef24..caf1370 100644 --- a/examples/euk_prok_virus/.tests/unit/common.py +++ b/examples/euk_prok_virus/.tests/unit/common.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from subprocess import check_output +from subprocess import check_output, PIPE cmp_cmds = { @@ -28,21 +28,22 @@ def check(self, cmp_cmds=cmp_cmds): for path, subdirs, files in os.walk(self.data_path) for f in files ) - print(f"input: {input_files}") # DEBUG - # Workdir 
files + print(f"input_files: {input_files}") # DEBUG + # Workdir files (ignoring '.snakemake/' and 'config/' folders) workdir_files = set( (Path(path) / f).relative_to(self.workdir) for path, subdirs, files in os.walk(self.workdir) for f in files + if "/.snakemake" not in path and "/config" not in path ) - print(f"workdir: {workdir_files}") # DEBUG + print(f"workdir_files: {workdir_files}") # DEBUG # Expected files expected_files = set( (Path(path) / f).relative_to(self.expected_path) for path, subdirs, files in os.walk(self.expected_path) for f in files ) - print(f"expected: {expected_files}") # DEBUG + print(f"expected_files: {expected_files}") # DEBUG assert expected_files.issubset( workdir_files @@ -55,5 +56,6 @@ def check(self, cmp_cmds=cmp_cmds): def compare_files(self, expected_file, generated_file, cmp_cmds): check_output( cmp_cmds.get(expected_file.suffix, ["cmp"]) - + [expected_file, generated_file] + + [expected_file, generated_file], + stderr=PIPE, ) diff --git a/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/config.yaml b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/config.yaml new file mode 100644 index 0000000..765a235 --- /dev/null +++ b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/config.yaml @@ -0,0 +1,155 @@ + +# - Only tested with Phred33 quality scores + +samples: config/samples.tsv + +units: config/units.tsv + + + +############# +### READS ### +############# +trim: + trim: + activate: true + tool: adapterremoval + params: "--trimns --maxns 10 --trimqualities --minlength 30 --mask-degenerate-bases --seed 12345" + + # Ignored for SE + collapse: + activate: true + params: "--collapse-conservatively" + +derep: + extension: + activate: false + k: 16 + params: "ibb=t prefilter=0 el=100 er=100 ecc=f ecco=f ignorebadquality extendrollback=0" + + derep: + activate: true + # vsearch or seqkit + tool: seqkit + params: "" + + low_complex: + params: "entropy=0.7 entropywindow=30 
entropyk=4" + + + +############# +### ALIGN ### +############# +prefilter: + taxa: "Bacteria,Archaea,Viruses" + + ref: + prok: + n_shards: 2 + path: "data/prok.{n_shard}-of-2.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal -N 1" + bt2l: False + acc2taxid: "data/prok.acc2taxid.gz" + virus: + n_shards: 1 + path: "data/virus.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/virus.acc2taxid.gz" + + filter: + saturated_reads: + activate: true + n_alns: 10 + + bam_filter: + reassign: + activate: false + params: "--iters 0 --min-read-ani 92 --min-read-count 3 --scale 0 --reference-lengths genomes.len.map" + + filter: + activate: false + params: "--min-read-ani 92 --min-read-count 3 --min-normalized-entropy 0.6 --min-normalized-gini 0.4 --min-avg-read-ani 94 --reference-lengths genomes.len.map" + + lca: + activate: false + params: "--lca-rank genus --reference-lengths genomes.len.map" + + taxonomy: + nodes: "data/taxdump/nodes.dmp" + names: "data/taxdump/names.dmp" + + metadmg: + damage: + params: "--print_length 15" + + lca: + params: "--fix_ncbi 0 --how_many 25 --weight_type 1 --edit_dist_max 10000 --lca_rank genus" + + dfit: + params: "--nopt 5 --showfits 2" + + +euk: + ref: + mitoch: + n_shards: 1 + path: "data/mitoch.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/mitoch.acc2taxid.gz" + plastid: + n_shards: 1 + path: "data/plastid.1-of-1.fas.gz" + map: + tool: bowtie2 + params: "-k 10 -L 22 -i S,1,1.15 --mp 1,1 --rdg 0,1 --rfg 0,1 --score-min L,0,-0.1 --no-unal" + bt2l: False + acc2taxid: "data/plastid.acc2taxid.gz" + + filter: + saturated_reads: + activate: true + n_alns: 10 + + bam_filter: + reassign: + activate: false + params: "--iters 0 --min-read-ani 92 
--min-read-count 3 --scale 0" + + filter: + activate: false + params: "--min-read-ani 92 --min-read-count 3 --min-normalized-entropy 0.6 --min-normalized-gini 0.4 --min-avg-read-ani 92" + + lca: + activate: false + params: "--lca-rank genus" + + taxonomy: + nodes: "data/taxdump/nodes.dmp" + names: "data/taxdump/names.dmp" + + metadmg: + damage: + params: "--print_length 15" + + lca: + params: "--fix_ncbi 0 --how_many 15 --sim_score_low 0.95 --weight_type 0 --lca_rank genus" + + dfit: + params: "--nopt 5 --showfits 2 --seed 12345" + + +############ +## REPORT ## +############ +report: + multiqc: "--verbose --cl-config 'custom_logo: data/KU_long.png' --cl-config 'custom_logo_title: CAEG - Center for Ancient Environmental Genomics' --cl-config 'custom_logo_url: https://globe.ku.dk/research/caeg/'" + multiqc_db_url: "sqlite:///test_qc.sqlite" diff --git a/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv new file mode 100644 index 0000000..5994081 --- /dev/null +++ b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/samples.tsv @@ -0,0 +1,2 @@ +sample alias group condition +Lib ancient diff --git a/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/units.tsv b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/units.tsv new file mode 100644 index 0000000..4a8a376 --- /dev/null +++ b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/config/config/units.tsv @@ -0,0 +1,4 @@ +sample library flowcell lane seq_type library_type material data machine run_n sample_n date center platform adapters +Lib LVsim1 BHXXXXXXXX L001 PE ds DNA data/test_L001_R{Read}.fq.gz SIMULATED 0000 S1 2025-10-09 CAEG ILLUMINA AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +Lib LVsim1 BHXXXXXXXX L002 PE ds DNA data/test_L002_R{Read}.fq.gz SIMULATED 0000 S2 2025-10-09 CAEG ILLUMINA 
AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +Lib LVsim2 BHXXXXXXXX L001 PE ds DNA data/test_L003_R{Read}.fq.gz SIMULATED 0000 S3 2025-10-09 CAEG ILLUMINA AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT diff --git a/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip new file mode 100644 index 0000000..8ba7679 Binary files /dev/null and b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/data/reports/multiqc_data.taxon.zip differ diff --git a/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag new file mode 100644 index 0000000..e212f20 --- /dev/null +++ b/examples/euk_prok_virus/.tests/unit/multiqc_taxon_upload/expected/stats/reports/multiqc_taxon.upload.flag @@ -0,0 +1,161 @@ +INFO:DB sqlite:///test_qc.sqlite does not exist. +INFO:Reading file reports/multiqc_data.taxon.zip +INFO:Uploading report v1.33 to DB... 
+INFO:Adding report record to DB +INFO:Adding report metadata to DB +INFO:Parsing section multiqc_samtools_stats_euk-samtools +INFO:Parsing section multiqc_bowtie2_euk-bowtie2 +INFO:Parsing section multiqc_fastqc_prefilter-fastqc +INFO:Parsing section multiqc_samtools_stats_prefilter-samtools +INFO:Parsing section multiqc_bowtie2_prefilter-bowtie2 +INFO:Parsing section multiqc_fastqc_low_complexity-fastqc +INFO:Parsing section multiqc_fastqc_derep-fastqc +INFO:Parsing section multiqc_nonpareil_merge_lanes-nonpareil +INFO:Parsing section multiqc_fastqc_merge_lanes-fastqc +INFO:Parsing section multiqc_fastqc_trim-fastqc +INFO:Parsing section multiqc_adapter_removal +INFO:Parsing section multiqc_fastqc_raw-fastqc +INFO:Parsing section multiqc_general_stats +INFO:Parsing plot samtools_alignment_plot +INFO:Parsing dataset samtools_alignment_plot +INFO:Parsing plot samtools-stats-dp +WARNING:Plot type violin plot is not supported +INFO:Parsing plot bowtie2_se_plot +INFO:Parsing dataset bowtie2_se_plot +INFO:Parsing plot prefilter-fastqc_sequence_counts_plot +INFO:Parsing dataset prefilter-fastqc_sequence_counts_plot +INFO:Parsing plot prefilter-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset prefilter-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot prefilter-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset prefilter-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot prefilter-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset prefilter-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset prefilter-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot prefilter-fastqc_per_base_n_content_plot +INFO:Parsing dataset prefilter-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot prefilter-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset 
prefilter-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot prefilter-fastqc_overrepresented_sequences_plot +INFO:Parsing dataset prefilter-fastqc_overrepresented_sequences_plot +INFO:Parsing plot prefilter-fastqc_top_overrepresented_sequences_table +WARNING:Plot type violin plot is not supported +INFO:Parsing plot prefilter-fastqc_adapter_content_plot +INFO:Parsing dataset prefilter-fastqc_adapter_content_plot +INFO:Parsing plot prefilter-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot samtools_alignment_plot-1 +INFO:Parsing dataset samtools_alignment_plot +INFO:Parsing plot samtools-stats-dp-1 +WARNING:Plot type violin plot is not supported +INFO:Parsing plot bowtie2_se_plot-1 +INFO:Parsing dataset bowtie2_se_plot +INFO:Parsing plot low_complexity-fastqc_sequence_counts_plot +INFO:Parsing dataset low_complexity-fastqc_sequence_counts_plot +INFO:Parsing plot low_complexity-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset low_complexity-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot low_complexity-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset low_complexity-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot low_complexity-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset low_complexity-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset low_complexity-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot low_complexity-fastqc_per_base_n_content_plot +INFO:Parsing dataset low_complexity-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot-1 +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot low_complexity-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset low_complexity-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot low_complexity-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot 
derep-fastqc_sequence_counts_plot +INFO:Parsing dataset derep-fastqc_sequence_counts_plot +INFO:Parsing plot derep-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset derep-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot derep-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset derep-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot derep-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset derep-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset derep-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot derep-fastqc_per_base_n_content_plot +INFO:Parsing dataset derep-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot-2 +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot derep-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset derep-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot derep-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot nonpareil-table +WARNING:Plot type violin plot is not supported +INFO:Parsing plot nonpareil-redundancy-plot +INFO:Parsing dataset nonpareil-redundancy-plot_Combined +INFO:Parsing dataset nonpareil-redundancy-plot_Observed +INFO:Parsing plot merge_lanes-fastqc_sequence_counts_plot +INFO:Parsing dataset merge_lanes-fastqc_sequence_counts_plot +INFO:Parsing plot merge_lanes-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset merge_lanes-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot merge_lanes-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset merge_lanes-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot merge_lanes-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset merge_lanes-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset merge_lanes-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot merge_lanes-fastqc_per_base_n_content_plot +INFO:Parsing dataset 
merge_lanes-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot-3 +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot merge_lanes-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset merge_lanes-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot merge_lanes-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot trim-fastqc_sequence_counts_plot +INFO:Parsing dataset trim-fastqc_sequence_counts_plot +INFO:Parsing plot trim-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset trim-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot trim-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset trim-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot trim-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset trim-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset trim-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot trim-fastqc_per_base_n_content_plot +INFO:Parsing dataset trim-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot-4 +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot trim-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset trim-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot trim-fastqc_overrepresented_sequences_plot +INFO:Parsing dataset trim-fastqc_overrepresented_sequences_plot +INFO:Parsing plot trim-fastqc_top_overrepresented_sequences_table +WARNING:Plot type violin plot is not supported +INFO:Parsing plot trim-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot ar_retained_plot +INFO:Parsing dataset ar_retained_plot +INFO:Parsing plot ar_length_count_plot +INFO:Parsing dataset ar_length_count_plot_All +INFO:Parsing dataset ar_length_count_plot_Mate1 +INFO:Parsing dataset ar_length_count_plot_Mate2 +INFO:Parsing dataset 
ar_length_count_plot_Singleton +INFO:Parsing dataset ar_length_count_plot_Collapsed +INFO:Parsing dataset ar_length_count_plot_Collapsed_Truncated +INFO:Parsing dataset ar_length_count_plot_Discarded +INFO:Parsing plot raw-fastqc_sequence_counts_plot +INFO:Parsing dataset raw-fastqc_sequence_counts_plot +INFO:Parsing plot raw-fastqc_per_base_sequence_quality_plot +INFO:Parsing dataset raw-fastqc_per_base_sequence_quality_plot +INFO:Parsing plot raw-fastqc_per_sequence_quality_scores_plot +INFO:Parsing dataset raw-fastqc_per_sequence_quality_scores_plot +INFO:Parsing plot raw-fastqc_per_sequence_gc_content_plot +INFO:Parsing dataset raw-fastqc_per_sequence_gc_content_plot_Percentages +INFO:Parsing dataset raw-fastqc_per_sequence_gc_content_plot_Counts +INFO:Parsing plot raw-fastqc_per_base_n_content_plot +INFO:Parsing dataset raw-fastqc_per_base_n_content_plot +INFO:Parsing plot fastqc_sequence_length_distribution_plot-5 +INFO:Parsing dataset fastqc_sequence_length_distribution_plot +INFO:Parsing plot raw-fastqc_sequence_duplication_levels_plot +INFO:Parsing dataset raw-fastqc_sequence_duplication_levels_plot +INFO:Parsing plot raw-fastqc_adapter_content_plot +INFO:Parsing dataset raw-fastqc_adapter_content_plot +INFO:Parsing plot raw-fastqc-status-check-heatmap +WARNING:Plot type heatmap is not supported +INFO:Parsing plot general_stats_table +WARNING:Plot type violin plot is not supported diff --git a/examples/euk_prok_virus/.tests/unit/test_multiqc_taxon_upload.py b/examples/euk_prok_virus/.tests/unit/test_multiqc_taxon_upload.py new file mode 100644 index 0000000..4d2a3e0 --- /dev/null +++ b/examples/euk_prok_virus/.tests/unit/test_multiqc_taxon_upload.py @@ -0,0 +1,69 @@ +""" +Rule test code for unit testing of rules generated with Snakemake 9.16.4.dev3. 
+""" + +import os +import sys +import shutil +import tempfile +from pathlib import Path +from subprocess import check_output + +sys.path.insert(0, os.path.dirname(__file__)) + + +def test_multiqc_taxon_upload(conda_prefix): + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "workdir" + config_path = Path(".tests/unit/multiqc_taxon_upload/config") + data_path = Path(".tests/unit/multiqc_taxon_upload/data") + expected_path = Path(".tests/unit/multiqc_taxon_upload/expected") + + # Copy config to the temporary workdir. + shutil.copytree(config_path, workdir) + + # Copy data to the temporary workdir. + shutil.copytree(data_path, workdir, dirs_exist_ok=True) + + # Run the test job. + check_output( + [ + "python", + "-m", + "snakemake", + "stats/reports/multiqc_taxon.upload.flag", + "--snakefile", + "../../workflow/Snakefile", + "-f", + "--notemp", + "--show-failed-logs", + "-j1", + "--target-files-omit-workdir-adjustment", + "--allowed-rules", + "multiqc_taxon_upload", + "--configfile", + "config/config.yaml", + "--software-deployment-method", + "conda", + "--directory", + workdir, + ] + + conda_prefix + ) + + # Check the output byte by byte using cmp/zcmp/bzcmp/xzcmp. + # To modify this behavior, you can inherit from common.OutputChecker in here + # and overwrite the method `compare_files(generated_file, expected_file)`, + # also see common.py. 
+ import common + + common.OutputChecker(data_path, expected_path, workdir).check( + { + ".flag": [ + "diff", + "--ignore-matching-lines=Uploading", + "--ignore-matching-lines=tzname", + ] + } + ) diff --git a/examples/euk_prok_virus/.tests/unit/test_taxon_align_stats.py b/examples/euk_prok_virus/.tests/unit/test_taxon_align_stats.py index 155b66a..522db7a 100644 --- a/examples/euk_prok_virus/.tests/unit/test_taxon_align_stats.py +++ b/examples/euk_prok_virus/.tests/unit/test_taxon_align_stats.py @@ -59,5 +59,11 @@ def test_taxon_align_stats(conda_prefix): import common common.OutputChecker(data_path, expected_path, workdir).check( - {".txt": ["diff", "--ignore-matching-lines=\\#"]} + { + ".txt": [ + "diff", + "--ignore-matching-lines=samtools", + "--ignore-matching-lines=command", + ] + } ) diff --git a/examples/euk_prok_virus/config/config.yaml b/examples/euk_prok_virus/config/config.yaml index 7dc6288..95a8564 100644 --- a/examples/euk_prok_virus/config/config.yaml +++ b/examples/euk_prok_virus/config/config.yaml @@ -152,3 +152,4 @@ euk: ############ report: multiqc: "--verbose --cl-config 'custom_logo: data/KU_long.png' --cl-config 'custom_logo_title: CAEG - Center for Ancient Environmental Genomics' --cl-config 'custom_logo_url: https://globe.ku.dk/research/caeg/'" + multiqc_db_url: "test_qc.sqlite" diff --git a/examples/make_dags.sh b/examples/make_dags.sh index 31a42e4..46e2c14 100755 --- a/examples/make_dags.sh +++ b/examples/make_dags.sh @@ -13,7 +13,7 @@ do snakemake $SNAKEMAKE_OPTS --dag | dot -Tsvg > dag.svg if [ -d .tests/unit/ ]; then - pytest -rxXs -p no:cacheprovider .tests/unit/ + pytest -r a -p no:cacheprovider .tests/unit/ fi cd ../ done diff --git a/workflow/Snakefile b/workflow/Snakefile index 9760c44..a9036d2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -152,7 +152,7 @@ rule multiqc_trim_upload: input: zip=rules.multiqc_trim.output.data, output: - touch("stats/reports/multiqc_trim.upload.flag"), + 
"stats/reports/multiqc_trim.upload.flag", log: "logs/reports/multiqc_trim.upload.log", params: @@ -165,7 +165,10 @@ rule multiqc_trim_upload: mem=lambda w, attempt: f"{1* attempt} GiB", runtime=lambda w, attempt: f"{30* attempt} m", shell: - "python {base_dir}/scripts/multiqc_upload.py --db-upload {input.zip} --db-url {params.db_url} --log-level INFO 2> {log}" + """ + python {base_dir}/scripts/multiqc_upload.py --db-upload {input.zip} --db-url {params.db_url} --log-level INFO 2> {log}; + cut -d ":" -f 4- {log} > {output[0]}; + """ rule target_trim: diff --git a/workflow/scripts/db.py b/workflow/scripts/db.py index 9dc388f..881894a 100644 --- a/workflow/scripts/db.py +++ b/workflow/scripts/db.py @@ -19,15 +19,15 @@ def delete_report(session, report_id): # Delete plot data - logging.debug(f"Deleting report {report.id} from table 'plot_data'.") + logging.debug(f"Deleting report {report_id} from table 'plot_data'.") session.query(PlotData).filter(PlotData.report_id == report_id).delete() session.commit() # Delete plot category - logging.debug(f"Deleting report {report.id} from table 'plot_category'.") + logging.debug(f"Deleting report {report_id} from table 'plot_category'.") session.query(PlotCategory).filter(PlotCategory.report_id == report_id).delete() session.commit() # Delete plot config - logging.debug(f"Deleting report {report.id} from table 'plot_config'.") + logging.debug(f"Deleting report {report_id} from table 'plot_config'.") session.query(PlotConfig).filter( PlotConfig.config_id.in_( session.query(PlotConfig.config_id) @@ -42,11 +42,11 @@ def delete_report(session, report_id): ).delete(synchronize_session="fetch") session.commit() # Delete sample data - logging.debug(f"Deleting report {report.id} from table 'sample_data'.") + logging.debug(f"Deleting report {report_id} from table 'sample_data'.") session.query(SampleData).filter(SampleData.report_id == report_id).delete() session.commit() # Delete sample data type - logging.debug(f"Deleting report 
{report.id} from table 'sample_data_type'.") + logging.debug(f"Deleting report {report_id} from table 'sample_data_type'.") session.query(SampleDataType).filter( SampleDataType.sample_data_type_id.in_( session.query(SampleDataType.sample_data_type_id) @@ -56,17 +56,18 @@ def delete_report(session, report_id): ).delete(synchronize_session="fetch") session.commit() # Delete report metadata - logging.debug(f"Deleting report {report.id} from table 'report_meta'.") + logging.debug(f"Deleting report {report_id} from table 'report_meta'.") session.query(ReportMeta).filter(ReportMeta.report_id == report_id).delete() session.commit() # Delete sample - logging.debug(f"Deleting report {report.id} from table 'sample'.") + logging.debug(f"Deleting report {report_id} from table 'sample'.") session.query(Sample).filter(Sample.report_id == report_id).delete() session.commit() # Delete report - logging.debug(f"Deleting report {report.id} from table 'report'.") + logging.debug(f"Deleting report {report_id} from table 'report'.") session.query(Report).filter(Report.report_id == report_id).delete() session.commit() + session.expunge_all() def upload_report(engine, report_data, force=False): @@ -237,6 +238,8 @@ def upload_report(engine, report_data, force=False): plot_config = copy.deepcopy(plot_data.get("config", plot_data["pconfig"])) for dst_idx, dataset in enumerate(plot_data["datasets"]): + dataset_id = dataset["uid"] + logging.info(f"Parsing dataset {dataset_id}") dls = None dataset_name = None if "data_labels" in plot_config and dst_idx < len( @@ -259,7 +262,7 @@ def upload_report(engine, report_data, force=False): session.query(PlotConfig) .filter( PlotConfig.config_type == plot_data["plot_type"], - PlotConfig.config_name == plot_id, + PlotConfig.config_name == dataset_id, PlotConfig.config_dataset == dataset_name, ) .first() @@ -270,7 +273,7 @@ def upload_report(engine, report_data, force=False): logging.debug("Adding plot config to DB") plot_config_record = PlotConfig( 
config_type=plot_data["plot_type"], - config_name=plot_id, + config_name=dataset_id, config_dataset=dataset_name, data=json.dumps(plot_config), ) @@ -371,7 +374,7 @@ def upload_report(engine, report_data, force=False): { x: y for x, y in list(line_data.items()) - if x not in ["data"] + if x not in ["data", "pairs"] } ) if plot_category: diff --git a/workflow/scripts/multiqc_upload.py b/workflow/scripts/multiqc_upload.py index f070b49..9b3e771 100644 --- a/workflow/scripts/multiqc_upload.py +++ b/workflow/scripts/multiqc_upload.py @@ -109,14 +109,24 @@ def main(): # Create SQLAlchemy engine import os - from urllib.parse import urlparse + from furl import furl from sqlalchemy import create_engine + from sqlalchemy.engine import URL from sqlalchemy_utils import database_exists, create_database, drop_database - url = urlparse(args.db_url) - username = os.environ.get("SQL_USER", url.username) - password = os.environ.get("SQL_PASSWORD", url.password) - engine = create_engine(url._replace(netloc=f'{username}:{password}@{url.hostname}:{url.port or 5432}').geturl()) + if args.db_url: + + url = furl(args.db_url) + + url_config = { + "drivername": os.getenv("SQL_DRIVER", url.scheme), + "username": os.getenv("PGUSER", url.username), + "password": os.getenv("PGPASSWORD", url.password), + "host": os.getenv("PGHOST", url.host), + "port": os.getenv("PGPORT", url.port), + "database": os.getenv("PGDATABASE", str(url.path)), + } + engine = create_engine(URL.create(**url_config)) # Checking if DB exists if database_exists(engine.url):