From e381fb7ed9c3d5425b1fa148c7e8489d848f3271 Mon Sep 17 00:00:00 2001 From: patruong Date: Tue, 10 Oct 2023 18:26:17 +0200 Subject: [PATCH 01/11] added functions for parsing hyphy simulated data and parsing and reformatting newick trees with more than 2 branches for difFUBAR analysis --- src/shared/shared.jl | 209 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 167 insertions(+), 42 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 7039c86..85d0dc7 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -3,7 +3,7 @@ function generate_tag_stripper(tags) function strip_tags_from_name(s::String) for t in tags - s = replace(s, t=>"") + s = replace(s, t => "") end return s end @@ -16,10 +16,10 @@ end Takes a nexus file from FigTree, where branches have been colored. Replaces all color tags with group tags that can be used in the models. Can add custom labels too. Should consider an entire custom dictionary as well in future. """ -function import_colored_figtree_nexus_as_tagged_tree(fname; custom_labels = String[]) - start_substr = "[&R] "; - lines = readlines(fname); - treeline = lines[findfirst([occursin(start_substr,l) for l in lines])]; +function import_colored_figtree_nexus_as_tagged_tree(fname; custom_labels=String[]) + start_substr = "[&R] " + lines = readlines(fname) + treeline = lines[findfirst([occursin(start_substr, l) for l in lines])] st = findfirst(start_substr, treeline)[end] treestr = treeline[st+1:end] R = r"\[\&\!color\=\#\w*\]" @@ -40,7 +40,7 @@ function import_colored_figtree_nexus_as_tagged_tree(fname; custom_labels = Stri end end for k in keys(d) - treestr = replace(treestr,k=>d[k]) + treestr = replace(treestr, k => d[k]) end tag_colors = [k[10:end-1] for k in keys(d)] @@ -51,6 +51,131 @@ end export import_colored_figtree_nexus_as_tagged_tree +######################################################## +#Function for grouped Newick to DifFUBAR analysis format +######################################################## + +function import_hyphy_simulated_FASTA(file_name) + # Import data generated using https://github.com/veg/hyphy-analyses/tree/master/SimulateMG94 + sequence_names = [] + sequences = [] + + open(file_name, "r") do file + current_sequence = "" + + for line in eachline(file) + if length(line) == 0 + break + else + if line[1] == '>' + # Store the current sequence and reset for the next entry + if !isempty(current_sequence) + push!(sequences, current_sequence) + end + push!(sequence_names, line[2:end]) + current_sequence = "" + else + # Append the line to the current sequence + current_sequence *= line + end + end + end + + # Store the last sequence after the loop ends (if any) + if !isempty(current_sequence) + push!(sequences, current_sequence) + end + end + sequence_names = [string(seq_name) for seq_name in sequence_names] + sequences = [string(seq) for seq in sequences] + + return sequence_names, sequences +end + +export import_hyphy_simulated_FASTA + +function replace_newick_tags(treestr) + # This function replaces whatever is in {} in a newick tree to G1...Gn + + pattern = r"\{([^}]+)\}" + unique_tags = Set{String}() + for match in eachmatch(pattern, treestr) + println(match) + push!(unique_tags, match.match) + end + unique_tags = collect(unique_tags) + num_unique_tags = length(unique_tags) + group_tags = ["{G$index}" for (index, _) in enumerate(unique_tags)] + + if length(unique_tags) != length(group_tags) + throw(ArgumentError("The number of unique tags must be equal to the number of group tags.")) + end + + tag_mapping = Dict{String,String}() + + for (old_tag, new_tag) in zip(unique_tags, group_tags) + tag_mapping[old_tag] = "$new_tag" + end + + for (old_tag, new_tag) in tag_mapping + treestr = replace(treestr, old_tag => new_tag) + end + + return treestr, group_tags, unique_tags +end + + +# Define a function to generate distinct hexadecimal color codes +function generate_hex_colors(num_colors) + # Generates an arbitrary number of hex-color to match group_tags + colors = [] + for i in 1:num_colors + r = rand(0:255) + g = rand(0:255) + b = rand(0:255) + hex_color = "#" * string(r, base=16, pad=2) * string(g, base=16, pad=2) * string(b, base=16, pad=2) + push!(colors, hex_color) + end + colors_string = [string(color) for color in colors] + return colors_string +end + + +function import_grouped_label_tree(tree_file) + # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag + tree = read_newick_tree(tree_file) + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) + tag_colors = generate_hex_colors(length(original_tags)) + return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors +end + +export import_grouped_label_tree + + +function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) + # If there are more than 2 tags in the newick tree, we split up the tags to the tags to be analyzed and the tags to be removed (placed as background) + analysis_tags = tags[tag_pos] + analysis_tag_colors = tag_colors[tag_pos] + remove_tags = tags[filter(i -> i ∉ tag_pos, eachindex(tags))] + return analysis_tags, analysis_tag_colors, remove_tags +end +export select_analysis_tags_from_newick_tree + +function remove_tags_from_newick_tree(treestring, tags) + # Cleans Newick tree by removing tags from tree before analysis step. Select subset of groups the rest is background. + new_tree = treestring + for tag in tags + new_tree = replace(new_tree, tag => "") + end + return new_tree +end +export remove_tags_from_newick_tree + + + + + #TODO: Make this first optimize a nuc model, then optimize a codon model starting from the nuc model estimates, fixing the GTR matrix """ @@ -59,47 +184,47 @@ export import_colored_figtree_nexus_as_tagged_tree Optimizes the MG94+F3x4 model on a tree, given a set of sequences and a tree. Returns the optimized tree, alpha, beta, nuc_matrix, F3x4, and eq_freqs. The leaf_name_transform kwarg can be used to transform the leaf names in the tree to match the seqnames. """ -function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform = x -> x, genetic_code = MolecularEvolution.universal_code) - +function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code) + #Count F3x4 frequencies from the seqs, and estimate codon freqs from this f3x4 = MolecularEvolution.count_F3x4(seqs) eq_freqs = MolecularEvolution.F3x4_eq_freqs(f3x4) #Set up a codon partition (will default to Universal genetic code) - initial_partition = CodonPartition(Int64(length(seqs[1])/3), code = genetic_code) + initial_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) initial_partition.state .= eq_freqs - populate_tree!(tree,initial_partition,seqnames,seqs, leaf_name_transform = leaf_name_transform) + populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) #We'll use the empirical F3x4 freqs, fixed MG94 alpha=1, and optimize the nuc parameters and MG94 beta #Note: the nuc rates are confounded with alpha initial_params = ( - rates = positive(ones(6)), #rates must be non-negative - beta = positive(1.0) + rates=positive(ones(6)), #rates must be non-negative + beta=positive(1.0) ) flat_initial_params, unflatten = value_flatten(initial_params) #See ParameterHandling.jl docs num_params = length(flat_initial_params) - function build_model_vec(p; F3x4 = f3x4, alpha = 1.0) + function build_model_vec(p; F3x4=f3x4, alpha=1.0) #Need to pass through genetic code here! #If you run into numerical issues with DiagonalizedCTMC, switch to GeneralCTMC instead - return GeneralCTMC(MolecularEvolution.MG94_F3x4(alpha, p.beta, reversibleQ(p.rates,ones(4)), F3x4, genetic_code = genetic_code)) + return GeneralCTMC(MolecularEvolution.MG94_F3x4(alpha, p.beta, reversibleQ(p.rates, ones(4)), F3x4, genetic_code=genetic_code)) end - function objective(params::NamedTuple; tree = tree, eq_freqs = eq_freqs) - return -log_likelihood!(tree,build_model_vec(params)) + function objective(params::NamedTuple; tree=tree, eq_freqs=eq_freqs) + return -log_likelihood!(tree, build_model_vec(params)) end opt = Opt(:LN_BOBYQA, num_params) - min_objective!(opt, (x,y) -> (objective ∘ unflatten)(x)) + min_objective!(opt, (x, y) -> (objective ∘ unflatten)(x)) lower_bounds!(opt, [-5.0 for i in 1:num_params]) upper_bounds!(opt, [5.0 for i in 1:num_params]) xtol_rel!(opt, 1e-12) - _,mini,_ = NLopt.optimize(opt, flat_initial_params) + _, mini, _ = NLopt.optimize(opt, flat_initial_params) final_params = unflatten(mini) - + #tree, alpha, beta, nuc_matrix, F3x4, eq_freqs - return tree,1.0,final_params.beta,reversibleQ(final_params.rates,ones(4)),f3x4,eq_freqs + return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs end @@ -110,31 +235,31 @@ end ########################################## #This should be in the main repo -function LDA_gibbs_track_allocation_vec(conditionals::Array{Float64,2}, alpha::Float64; iters = 10000) +function LDA_gibbs_track_allocation_vec(conditionals::Array{Float64,2}, alpha::Float64; iters=10000) grid_size, num_sites = size(conditionals) #This should instead track an integer allocation vec per MCMC iteration - will be smaller - alloc_grid = zeros(Int64,(grid_size,num_sites)) + alloc_grid = zeros(Int64, (grid_size, num_sites)) φ = zeros(grid_size) θ = ones(grid_size) ./ grid_size v = zeros(grid_size) θsum = zeros(grid_size) burnin::Int = div(iters, 5) - for iter=1:iters + for iter = 1:iters φ .= alpha - for i=1:num_sites - @simd for j=1:grid_size - @inbounds v[j] = θ[j]*conditionals[j,i] - end - samp = sample(1:grid_size,Weights(v)) + for i = 1:num_sites + @simd for j = 1:grid_size + @inbounds v[j] = θ[j] * conditionals[j, i] + end + samp = sample(1:grid_size, Weights(v)) φ[samp] += 1.0 if iter > burnin - alloc_grid[samp,i] += 1 + alloc_grid[samp, i] += 1 end end θ .= rand(Dirichlet(φ)) if iter > burnin #NOTE: test without simd and inbounds macros - @simd for j=1:grid_size + @simd for j = 1:grid_size @inbounds θsum[j] += θ[j] end end @@ -155,9 +280,9 @@ end #Sets up model memoization - could probably do this a bit more generally function MG94_cacher(code) d = Dict{Any,DiagonalizedCTMC}() - function cached_model(args...; genetic_code = code) - if !haskey(d,args) - d[args] = DiagonalizedCTMC(MolecularEvolution.MG94_F3x4(args..., genetic_code = genetic_code)) + function cached_model(args...; genetic_code=code) + if !haskey(d, args) + d[args] = DiagonalizedCTMC(MolecularEvolution.MG94_F3x4(args..., genetic_code=genetic_code)) end return d[args] end @@ -170,9 +295,9 @@ function MolecularEvolution.populate_tree!( starting_message::Vector{<:Partition}, names, data; - init_all_messages = true, - tolerate_missing = 1, #0 = error if missing; 1 = warn and set to missing data; 2 = set to missing data - leaf_name_transform = x -> x + init_all_messages=true, + tolerate_missing=1, #0 = error if missing; 1 = warn and set to missing data; 2 = set to missing data + leaf_name_transform=x -> x ) if init_all_messages internal_message_init!(tree, starting_message) @@ -201,18 +326,18 @@ function MolecularEvolution.populate_tree!( starting_partition::Partition, names, data; - init_all_messages = true, - tolerate_missing = 1, - leaf_name_transform = x -> x + init_all_messages=true, + tolerate_missing=1, + leaf_name_transform=x -> x ) MolecularEvolution.populate_tree!( tree, [starting_partition], names, data, - init_all_messages = init_all_messages, - tolerate_missing = tolerate_missing, - leaf_name_transform = leaf_name_transform + init_all_messages=init_all_messages, + tolerate_missing=tolerate_missing, + leaf_name_transform=leaf_name_transform ) end From cbf2fc8041df431e5cd85a051d87ed94da36ccc4 Mon Sep 17 00:00:00 2001 From: patruong Date: Tue, 5 Dec 2023 15:00:33 +0100 Subject: [PATCH 02/11] different prune algos --- src/difFUBAR/difFUBAR.jl | 1349 ++++++++++++++++++++++++++++++++++---- 1 file changed, 1214 insertions(+), 135 deletions(-) diff --git a/src/difFUBAR/difFUBAR.jl b/src/difFUBAR/difFUBAR.jl index a1a563c..75f6206 100644 --- a/src/difFUBAR/difFUBAR.jl +++ b/src/difFUBAR/difFUBAR.jl @@ -12,6 +12,9 @@ #Here, untagged comes after the tags + +# added for parallellizing difFUBAR_grid + function model_ind(str::String, tags::Vector{String}) ind = length(tags) + 1 for (i, t) in enumerate(tags) @@ -226,6 +229,7 @@ function difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foregr num_groups = length(tags) is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + #is_background = false tensor_dims = 1 + num_groups + is_background function add_to_each_element(vec_of_vec, elems) @@ -250,23 +254,36 @@ function difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foregr verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + for (row_ind, cp) in enumerate(codon_param_vec) alpha = cp[1] omegas = cp[2:end] tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + #print("felsenstein") felsenstein!(tree, tagged_models) #This combine!() is needed because the current site_LLs function applies to a partition #And after a felsenstein pass, you don't have the eq freqs factored in. #We could make a version of log_likelihood() that returns the partitions instead of just the sum + #print("combine") combine!.(tree.message, tree.parent_message) - + #println(length(tree.message[1])) + #print("log_con_like_matrix") log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! verbosity > 0 && if mod(row_ind, 500) == 1 print(round(100 * row_ind / length(codon_param_vec)), "% ") flush(stdout) end end + verbosity > 0 && println() con_lik_matrix = zeros(size(log_con_lik_matrix)) @@ -278,175 +295,1237 @@ function difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foregr return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds end -function difFUBAR_sample(con_lik_matrix, iters; verbosity=1) - verbosity > 0 && println("Step 4: Running Gibbs sampler to infer site categories.") - alloc_grid, theta = LDA_gibbs_track_allocation_vec(con_lik_matrix, 0.1, iters=iters) - return alloc_grid, theta -end +#foreground_grid and background_grid control the number of categories below 1.0 +function difFUBAR_grid_pruned_1(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) -function difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=1, sites_to_plot=nothing, exports=true) - grid_size, num_sites = size(alloc_grid) + cached_model = MG94_cacher(code) - r(s) = round(s, digits=4) + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end - detected_sites = Int64[] - group1_volumes = Vector{Float64}[] - group2_volumes = Vector{Float64}[] - alpha_volumes = Vector{Float64}[] - detections = Vector{Float64}[] #legacy name - now includes all 4 "relevant" site posteriors - param_means = Vector{Float64}[] + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 - ω1 = [c[2] for c in codon_param_vec] - ω2 = [c[3] for c in codon_param_vec] - alphas = [c[1] for c in codon_param_vec] - ω1_greater_filt = ω1 .> ω2 - ω2_greater_filt = ω2 .> ω1 - ω1_pos_filt = ω1 .> 1.0 - ω2_pos_filt = ω2 .> 1.0 + num_groups = length(tags) + #is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + is_background = false + tensor_dims = 1 + num_groups + is_background - verbosity > 0 && println("Step 5: Tabulating and plotting. Detected sites:") - for site in 1:num_sites - ω1_greater_posterior = sum(alloc_grid[ω1_greater_filt, site]) / sum(alloc_grid[:, site]) - ω2_greater_posterior = sum(alloc_grid[ω2_greater_filt, site]) / sum(alloc_grid[:, site]) - ω1_pos_posterior = sum(alloc_grid[ω1_pos_filt, site]) / sum(alloc_grid[:, site]) - ω2_pos_posterior = sum(alloc_grid[ω2_pos_filt, site]) / sum(alloc_grid[:, site]) - detecs = [ω1_greater_posterior, ω2_greater_posterior, ω1_pos_posterior, ω2_pos_posterior] + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end - site_counts_ω1 = collapse_counts(ω1, alloc_grid[:, site], cases=omegagrid) - site_counts_ω2 = collapse_counts(ω2, alloc_grid[:, site], cases=omegagrid) - site_counts_alphas = collapse_counts(alphas, alloc_grid[:, site], cases=alphagrid) + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec - mean_alpha = sum(site_counts_alphas .* alphagrid) - mean_ω1 = sum(site_counts_ω1 .* omegagrid) - mean_ω2 = sum(site_counts_ω2 .* omegagrid) + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) - push!(detections, detecs) - push!(param_means, [mean_alpha, mean_ω1, mean_ω2]) - push!(group1_volumes, site_counts_ω1) - push!(group2_volumes, site_counts_ω2) - push!(alpha_volumes, site_counts_alphas) + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") - if maximum(detecs) > pos_thresh - verbosity > 0 && print("Site $(site) - ") - verbosity > 0 && print("P(ω1 > ω2):", ω1_greater_posterior) - verbosity > 0 && print("; P(ω2 > ω1):", ω2_greater_posterior) - verbosity > 0 && print("; P(ω1 > 1):", ω1_pos_posterior) - verbosity > 0 && println("; P(ω2 > 1):", ω2_pos_posterior) - push!(detected_sites, site) + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + pure_subclades = FelNode[] + # Traverses the tree recursively with a dfs whilst pushing roots of pure subclades to list^ + function find_pure_subclades(node) + # Get the index of the node's tag + tag_ind_of_node = model_ind(node.name, tags) + + # If the node is a leaf, it's pure + if length(node.children) == 0 + return true, tag_ind_of_node end - end - #Exporting site data - df = DataFrame() - df[!, "Codon Sites"] = [1:num_sites;] - df[!, "P(ω1 > ω2)"] = [d[1] for d in detections] - df[!, "P(ω2 > ω1)"] = [d[2] for d in detections] - df[!, "P(ω1 > 1)"] = [d[3] for d in detections] - df[!, "P(ω2 > 1)"] = [d[4] for d in detections] - df[!, "mean(α)"] = [d[1] for d in param_means] - df[!, "mean(ω1)"] = [d[2] for d in param_means] - df[!, "mean(ω2)"] = [d[3] for d in param_means] + children_is_pure_and_tag = [] + pure_children = FelNode[] - verbosity > 0 && println("\nIf exports = true, writing results for all sites to CSV: " * analysis_name * "_posteriors.csv") - exports && CSV.write(analysis_name * "_posteriors.csv", df) + for child in node.children + child_is_pure, tag_ind = find_pure_subclades(child) + if child_is_pure + push!(pure_children, child) + end + push!(children_is_pure_and_tag, (child_is_pure, tag_ind)) + end - sites = [1:num_sites;] + # Get the index of the node's first child's tag + tag_ind_of_first_child = children_is_pure_and_tag[1][2] + + # This is the case where the subclade starting at node is pure + if all(x == (true, tag_ind_of_first_child) for x in children_is_pure_and_tag) + if tag_ind_of_node != tag_ind_of_first_child + # The purity is broken at this node + push!(pure_subclades, node) + return false, tag_ind_of_node + end + # The purity is not broken at this node + return true, tag_ind_of_node + end - #Select the sites that will get plotted, in case you want to customize this. - if isnothing(sites_to_plot) - sites_to_plot = detected_sites + # This is the case where some child has mixed tags or the children are pure with regards to different tags + for pure_child in pure_children + if length(pure_child.children) == 0 + # We don't want to push leaves into pure_subclades + continue + end + push!(pure_subclades, pure_child) + end + return false, tag_ind_of_node end - if length(sites_to_plot) == 0 - verbosity > 0 && println("No sites detected above threshold.") - elseif exports - verbosity > 0 && println("Plotting alpha and omega distributions. If exports = true, saved as " * analysis_name * "_violin_*.pdf") + time_elapsed = @elapsed find_pure_subclades(tree) + println("Max prune " * string(time_elapsed)) + + cached_messages = Dict() + cached_tag_inds = Dict() + for x in pure_subclades + cached_messages[x] = Dict() + cached_tag_inds[x] = model_ind(x.children[1].name, tags) + parent = x.parent + x.parent = nothing + for cp in codon_param_vec + alpha = cp[1] + omegas = cp[2:end] + + relevant_omega = omegas[cached_tag_inds[x]] + + if haskey(cached_messages[x], (alpha, relevant_omega)) + continue + end + + models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + felsenstein!(x, models) + cached_messages[x][(alpha, relevant_omega)] = deepcopy(x.message) + end + x.parent = parent + x.children = FelNode[] + end - #Assumes alpha and omega grids are the same!? Currently enforced by args passed into difFUBAR_grid - #Maybe this is ok - grd = round.(omegagrid, digits=3) + for (row_ind, cp) in enumerate(codon_param_vec) + alpha = cp[1] + omegas = cp[2:end] + tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) - #Three plotting examples. - #Plot the alphas for each flagged site + # added for subtree prune + for x in pure_subclades + x.message = cached_messages[x][(alpha, omegas[cached_tag_inds[x]])] + end - lmargin = 7 + length(sites_to_plot) / 2 - ysize = 300 + 70 * length(sites[sites_to_plot]) - FUBAR_violin_plot(sites[sites_to_plot], alpha_volumes[sites_to_plot] .* 0.75, grd, tag="α", color="green", x_label="α") - plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + felsenstein!(tree, tagged_models) + #This combine!() is needed because the current site_LLs function applies to a partition + #And after a felsenstein pass, you don't have the eq freqs factored in. + #We could make a version of log_likelihood() that returns the partitions instead of just the sum + combine!.(tree.message, tree.parent_message) - savefig(analysis_name * "_violin_alpha.pdf") - Plots.CURRENT_PLOT.nullableplot = nothing # PyPlots close() + log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! + verbosity > 0 && if mod(row_ind, 500) == 1 + print(round(100 * row_ind / length(codon_param_vec)), "% ") + flush(stdout) + end + end - #Plot the G1 and G2 omegas - FUBAR_violin_plot(sites[sites_to_plot], group1_volumes[sites_to_plot], grd, tag="ω1", color=tag_colors[1]) - FUBAR_violin_plot(sites[sites_to_plot], group2_volumes[sites_to_plot], grd, tag="ω2", color=tag_colors[2], x_label="ω") - plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + verbosity > 0 && println() - savefig(analysis_name * "_violin_omegas.pdf") - Plots.CURRENT_PLOT.nullableplot = nothing + con_lik_matrix = zeros(size(log_con_lik_matrix)) + site_scalers = maximum(log_con_lik_matrix, dims=1) + for i in 1:num_sites + con_lik_matrix[:, i] .= exp.(log_con_lik_matrix[:, i] .- site_scalers[i]) + end - #Plot all three parameters, using the v_offset to separate the alphas from the omegas - FUBAR_violin_plot(sites[sites_to_plot], group1_volumes[sites_to_plot] .* 0.5, grd, tag="ω1", color=tag_colors[1], v_offset=-0.1) - FUBAR_violin_plot(sites[sites_to_plot], group2_volumes[sites_to_plot] .* 0.5, grd, tag="ω2", color=tag_colors[2], v_offset=-0.1) - FUBAR_violin_plot(sites[sites_to_plot], alpha_volumes[sites_to_plot] .* 0.5, grd, tag="α", color="green", v_offset=0.1) - plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds +end - savefig(analysis_name * "_violin_all_params.pdf") - Plots.CURRENT_PLOT.nullableplot = nothing - #Coerce the violin plot function to also viz the "detection" posteriors. - floored_detec = [clamp.((d .- 0.95) .* 20, 0.0, 1.0) for d in detections[sites_to_plot]] - println(sites_to_plot) - FUBAR_violin_plot(sites[sites_to_plot], [[f[1], 0.0, 0.0, 0.0] for f in floored_detec] .* 0.5, - ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω1>ω2)", color=tag_colors[1], - vertical_ind=nothing, plot_legend=false) - FUBAR_violin_plot(sites[sites_to_plot], [[0.0, f[2], 0.0, 0.0] for f in floored_detec] .* 0.5, - ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω2>ω1)", color=tag_colors[2], - vertical_ind=nothing, plot_legend=false) - FUBAR_violin_plot(sites[sites_to_plot], [[0.0, 0.0, f[3], 0.0] for f in floored_detec] .* 0.5, - ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω1>1)", color=tag_colors[1], - vertical_ind=nothing, plot_legend=false) - FUBAR_violin_plot(sites[sites_to_plot], [[0.0, 0.0, 0.0, f[4]] for f in floored_detec] .* 0.5, - ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω2>1)", color=tag_colors[2], - vertical_ind=nothing, legend_ncol=2, x_label="", plot_legend=false) - lmargin_detect = 12 + length(sites_to_plot) / 2 +#foreground_grid and background_grid control the number of categories below 1.0 +function difFUBAR_grid_pruned_2(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) - plot!(size=(800, ysize), margins=1Plots.cm, legend=false, grid=false, - ytickfont=18, bottom_margin=30mm, left_margin=(lmargin_detect)mm, - xtickfont=18) - println(length(sites_to_plot)) + cached_model = MG94_cacher(code) - savefig(analysis_name * "_detections.pdf") - Plots.CURRENT_PLOT.nullableplot = nothing + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 - if exports - Plots.CURRENT_PLOT.nullableplot = nothing - FUBAR_omega_plot(param_means, tag_colors, pos_thresh, detections, num_sites) - plot!(size=(1.25 * length(sites), 300), margins=1Plots.cm, grid=false, legendfontsize=8) - savefig(analysis_name * "_site_omega_means.pdf") + num_groups = length(tags) + is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + tensor_dims = 1 + num_groups + is_background + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] end - return df -end -export difFUBAR_tabulate + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec -#Must return enough to re-calculate detections etc -function difFUBAR(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) - analysis_name = outpath - tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) - tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) - con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code, - verbosity=verbosity, foreground_grid=6, background_grid=4) - alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) - df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) - #Return df, (tuple of partial calculations needed to re-run tablulate) - return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) -end -export difFUBAR + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + # getnodelist and get a specific node from nodelist + #function get_node_by_name(query_node::String, tree::FelNode) + # # This is a useful function that maybe should be in AbstractTreeNode.jl + # for node in getnodelist(tree) + # if node.name == query_node + # return node + # end + # end + #end + + function check_purity_from_node_and_forward_in_time(tree) + # collect all nodes under and see if they are the same group + # if return true that means everything under the tree is pure (the tree node can be differnt though) + node_groups = [] + for node in getnodelist(tree) + if tree.name != node.name # we only check nodes under the input tree or subtree + push!(node_groups, model_ind(node.name, tags)) + end + end + if length(unique(node_groups)) == 1 + return true + else + return false + end + end + + pure_clades = [] + function traverse_tree_to_check_for_pure_clades(pure_clades, tree) + for child in tree.children + if check_purity_from_node_and_forward_in_time(child) + #print(child.name) + #println(" PURE") + # add this node, as everything under this node is pure + push!(pure_clades, child) + else + #print(child.name) + #println(" not pure") # keep recursion to se if we go forward in time if we can find pure clades + traverse_tree_to_check_for_pure_clades(pure_clades, child) + end + end + return pure_clades + end + + time_elapsed = @elapsed subtree_tops = traverse_tree_to_check_for_pure_clades(pure_clades, tree) + println("Patrick prune " * string(time_elapsed)) + + cached_values = Dict() + for x in subtree_tops + #x = get_node_by_name(x, tree) + cached_values[x] = Dict() + parent = x.parent + x.parent = nothing + for cp in codon_param_vec + alpha = cp[1] + omegas = cp[2:end] + + relevant_omega = omegas[model_ind(x.name, tags)] #here we need to change to child omega + + if haskey(cached_values[x], (alpha, relevant_omega)) + continue + end + + models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + felsenstein!(x, models) + cached_values[x][(alpha, relevant_omega)] = deepcopy(x.message) + end + x.parent = parent + x.children = FelNode[] + end + #println("DEBUG") + #println(pure_clades) + + for (row_ind, cp) in enumerate(codon_param_vec) + alpha = cp[1] + omegas = cp[2:end] + tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + + # added for subtree prune + for x in pure_clades + #x = get_node_by_name(x, tree) + x.message = cached_values[x][(alpha, omegas[model_ind(x.name, tags)])] + end + + felsenstein!(tree, tagged_models) + #This combine!() is needed because the current site_LLs function applies to a partition + #And after a felsenstein pass, you don't have the eq freqs factored in. + #We could make a version of log_likelihood() that returns the partitions instead of just the sum + combine!.(tree.message, tree.parent_message) + + log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! + verbosity > 0 && if mod(row_ind, 500) == 1 + print(round(100 * row_ind / length(codon_param_vec)), "% ") + flush(stdout) + end + end + + verbosity > 0 && println() + + con_lik_matrix = zeros(size(log_con_lik_matrix)) + site_scalers = maximum(log_con_lik_matrix, dims=1) + for i in 1:num_sites + con_lik_matrix[:, i] .= exp.(log_con_lik_matrix[:, i] .- site_scalers[i]) + end + + return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds +end + +function difFUBAR_grid_pruned_3(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) + + cached_model = MG94_cacher(code) + + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 + + num_groups = length(tags) + #is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + is_background = false + tensor_dims = 1 + num_groups + is_background + + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end + + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec + + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) + + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + # getnodelist and get a specific node from nodelist + #function get_node_by_name(query_node::String, tree::FelNode) + # # This is a useful function that maybe should be in AbstractTreeNode.jl + # for node in getnodelist(tree) + # if node.name == query_node + # return node + # end + # end + #end + + function check_purity_from_node_and_forward_in_time(tree) + # collect all nodes under and see if they are the same group + # if return true that means everything under the tree is pure (the tree node can be differnt though) + node_groups = [] + for node in getnodelist(tree) + if tree.name != node.name # we only check nodes under the input tree or subtree + push!(node_groups, model_ind(node.name, tags)) + end + end + if length(unique(node_groups)) == 1 + return true + else + return false + end + end + + pure_clades = [] + function traverse_tree_to_check_for_pure_clades(pure_clades, tree) + for child in tree.children + if check_purity_from_node_and_forward_in_time(child) + #print(child.name) + #println(" PURE") + # add this node, as everything under this node is pure + push!(pure_clades, child) + else + #print(child.name) + #println(" not pure") # keep recursion to se if we go forward in time if we can find pure clades + traverse_tree_to_check_for_pure_clades(pure_clades, child) + end + end + return pure_clades + end + + time_elapsed = @elapsed subtree_tops = traverse_tree_to_check_for_pure_clades(pure_clades, tree) + println("Patrick prune " * string(time_elapsed)) + + cached_messages = Dict() + cached_tag_inds = Dict() + for x in subtree_tops + cached_messages[x] = Dict() + cached_tag_inds[x] = model_ind(x.children[1].name, tags) + parent = x.parent + x.parent = nothing + for cp in codon_param_vec + alpha = cp[1] + omegas = cp[2:end] + + relevant_omega = omegas[cached_tag_inds[x]] + + if haskey(cached_messages[x], (alpha, relevant_omega)) + continue + end + + models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + felsenstein!(x, models) + cached_messages[x][(alpha, relevant_omega)] = deepcopy(x.message) + end + x.parent = parent + x.children = FelNode[] + end + + for (row_ind, cp) in enumerate(codon_param_vec) + alpha = cp[1] + omegas = cp[2:end] + tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + + # added for subtree prune + for x in subtree_tops + x.message = cached_messages[x][(alpha, omegas[cached_tag_inds[x]])] + end + + felsenstein!(tree, tagged_models) + #This combine!() is needed because the current site_LLs function applies to a partition + #And after a felsenstein pass, you don't have the eq freqs factored in. + #We could make a version of log_likelihood() that returns the partitions instead of just the sum + combine!.(tree.message, tree.parent_message) + + log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! + verbosity > 0 && if mod(row_ind, 500) == 1 + print(round(100 * row_ind / length(codon_param_vec)), "% ") + flush(stdout) + end + end + + verbosity > 0 && println() + + con_lik_matrix = zeros(size(log_con_lik_matrix)) + site_scalers = maximum(log_con_lik_matrix, dims=1) + for i in 1:num_sites + con_lik_matrix[:, i] .= exp.(log_con_lik_matrix[:, i] .- site_scalers[i]) + end + + return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds +end + + +#foreground_grid and background_grid control the number of categories below 1.0 +function difFUBAR_grid_pruned_4(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) + + cached_model = MG94_cacher(code) + + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 + + num_groups = length(tags) + #is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + is_background = false + + tensor_dims = 1 + num_groups + is_background + + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end + + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec + + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) + + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + # getnodelist and get a specific node from nodelist + #function get_node_by_name(query_node::String, tree::FelNode) + # # This is a useful function that maybe should be in AbstractTreeNode.jl + # for node in getnodelist(tree) + # if node.name == query_node + # return node + # end + # end + #end + + function check_purity_from_node_and_forward_in_time(tree) + # collect all nodes under and see if they are the same group + # if return true that means everything under the tree is pure (the tree node can be differnt though) + node_groups = [] + for node in getnodelist(tree) + if tree.name != node.name # we only check nodes under the input tree or subtree + push!(node_groups, model_ind(node.name, tags)) + end + end + if length(unique(node_groups)) == 1 + return true + else + return false + end + end + + pure_clades = [] + function traverse_tree_to_check_for_pure_clades(pure_clades, tree) + for child in tree.children + if check_purity_from_node_and_forward_in_time(child) + #print(child.name) + #println(" PURE") + # add this node, as everything under this node is pure + push!(pure_clades, child) + else + #print(child.name) + #println(" not pure") # keep recursion to se if we go forward in time if we can find pure clades + traverse_tree_to_check_for_pure_clades(pure_clades, child) + end + end + return pure_clades + end + + time_elapsed = @elapsed subtree_tops = traverse_tree_to_check_for_pure_clades(pure_clades, tree) + println("Patrick prune " * string(time_elapsed)) + + cached_messages = Dict() + for x in subtree_tops + cached_messages[x] = Dict() + parent = x.parent + x.parent = nothing + for cp in codon_param_vec + alpha = cp[1] + omegas = cp[2:end] + + relevant_omega = omegas[model_ind(x.children[1].name, tags)] # children[1].name is the same as children[2].name + + if haskey(cached_messages[x], (alpha, relevant_omega)) + continue + end + + models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + felsenstein!(x, models) + cached_messages[x][(alpha, relevant_omega)] = deepcopy(x.message) + end + x.parent = parent + end + + for (row_ind, cp) in enumerate(codon_param_vec) + alpha = cp[1] + omegas = cp[2:end] + tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + + # added for subtree prune + pure_children = [] + for x in subtree_tops + push!(pure_children, x.children) + x.message = cached_messages[x][(alpha, omegas[model_ind(x.children[1].name, tags)])] + x.children = FelNode[] + end + + felsenstein!(tree, tagged_models) + #This combine!() is needed because the current site_LLs function applies to a partition + #And after a felsenstein pass, you don't have the eq freqs factored in. + #We could make a version of log_likelihood() that returns the partitions instead of just the sum + combine!.(tree.message, tree.parent_message) + + log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! + verbosity > 0 && if mod(row_ind, 500) == 1 + print(round(100 * row_ind / length(codon_param_vec)), "% ") + flush(stdout) + end + + for (i, x) in enumerate(subtree_tops) + x.children = pure_children[i] + end + end + + verbosity > 0 && println() + + con_lik_matrix = zeros(size(log_con_lik_matrix)) + site_scalers = maximum(log_con_lik_matrix, dims=1) + for i in 1:num_sites + con_lik_matrix[:, i] .= exp.(log_con_lik_matrix[:, i] .- site_scalers[i]) + end + + return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds +end + +function difFUBAR_grid_final(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) + + cached_model = MG94_cacher(code) + + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 + + num_groups = length(tags) + #is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree) if !isroot(n)]) > num_groups + + tensor_dims = 1 + num_groups + is_background + + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end + + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec + + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) + + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + function check_purity_from_node_and_forward_in_time(tree) + # collect all nodes under and see if they are the same group + # if return true that means everything under the tree is pure (the tree node can be differnt though) + node_groups = [] + for node in getnodelist(tree) + if tree.name != node.name # we only check nodes under the input tree or subtree + push!(node_groups, model_ind(node.name, tags)) + end + end + if length(unique(node_groups)) == 1 + return true + else + return false + end + end + + pure_clades = [] + function traverse_tree_to_check_for_pure_clades(pure_clades, tree) + for child in tree.children + if check_purity_from_node_and_forward_in_time(child) + # add this node, as everything under this node is pure + push!(pure_clades, child) + else + traverse_tree_to_check_for_pure_clades(pure_clades, child) + end + end + return pure_clades + end + + time_elapsed = @elapsed subtree_tops = traverse_tree_to_check_for_pure_clades(pure_clades, tree) + println("Patrick prune " * string(time_elapsed)) + + cached_messages = Dict() + cached_tag_inds = Dict() + for x in subtree_tops + cached_messages[x] = Dict() + cached_tag_inds[x] = model_ind(x.children[1].name, tags) + parent = x.parent + x.parent = nothing + for cp in codon_param_vec + alpha = cp[1] + omegas = cp[2:end] + + relevant_omega = omegas[cached_tag_inds[x]] + + if haskey(cached_messages[x], (alpha, relevant_omega)) + continue + end + + models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + felsenstein!(x, models) + cached_messages[x][(alpha, relevant_omega)] = deepcopy(x.message) + end + x.parent = parent + x.children = FelNode[] + end + + for (row_ind, cp) in enumerate(codon_param_vec) + alpha = cp[1] + omegas = cp[2:end] + tagged_models = N_Omegas_model_func(tags, omegas, alpha, GTRmat, F3x4_freqs, code) + + # added for subtree prune + for x in subtree_tops + x.message = cached_messages[x][(alpha, omegas[cached_tag_inds[x]])] + end + + felsenstein!(tree, tagged_models) + #This combine!() is needed because the current site_LLs function applies to a partition + #And after a felsenstein pass, you don't have the eq freqs factored in. + #We could make a version of log_likelihood() that returns the partitions instead of just the sum + combine!.(tree.message, tree.parent_message) + + log_con_lik_matrix[row_ind, :] .= MolecularEvolution.site_LLs(tree.message[1]) #Check that these grab the scaling constants as well! + verbosity > 0 && if mod(row_ind, 500) == 1 + print(round(100 * row_ind / length(codon_param_vec)), "% ") + flush(stdout) + end + end + + verbosity > 0 && println() + + con_lik_matrix = zeros(size(log_con_lik_matrix)) + site_scalers = maximum(log_con_lik_matrix, dims=1) + for i in 1:num_sites + con_lik_matrix[:, i] .= exp.(log_con_lik_matrix[:, i] .- site_scalers[i]) + end + + return con_lik_matrix, log_con_lik_matrix, codon_param_vec, alphagrid, omegagrid, param_kinds +end + + +function difFUBAR_grid_pruned_3_return_subtree_tops(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) + + cached_model = MG94_cacher(code) + + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 + + num_groups = length(tags) + is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + tensor_dims = 1 + num_groups + is_background + + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end + + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec + + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) + + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + # getnodelist and get a specific node from nodelist + #function get_node_by_name(query_node::String, tree::FelNode) + # # This is a useful function that maybe should be in AbstractTreeNode.jl + # for node in getnodelist(tree) + # if node.name == query_node + # return node + # end + # end + #end + + function check_purity_from_node_and_forward_in_time(tree) + # collect all nodes under and see if they are the same group + # if return true that means everything under the tree is pure (the tree node can be differnt though) + node_groups = [] + for node in getnodelist(tree) + if tree.name != node.name # we only check nodes under the input tree or subtree + push!(node_groups, model_ind(node.name, tags)) + end + end + if length(unique(node_groups)) == 1 + return true + else + return false + end + end + + pure_clades = [] + function traverse_tree_to_check_for_pure_clades(pure_clades, tree) + for child in tree.children + if check_purity_from_node_and_forward_in_time(child) + #print(child.name) + #println(" PURE") + # add this node, as everything under this node is pure + push!(pure_clades, child) + else + #print(child.name) + #println(" not pure") # keep recursion to se if we go forward in time if we can find pure clades + traverse_tree_to_check_for_pure_clades(pure_clades, child) + end + end + return pure_clades + end + + time_elapsed = @elapsed subtree_tops = traverse_tree_to_check_for_pure_clades(pure_clades, tree) + return subtree_tops +end + +#foreground_grid and background_grid control the number of categories below 1.0 +function difFUBAR_grid_pruned_1_return_subtree_tops(tree, tags, GTRmat, F3x4_freqs, code; verbosity=1, foreground_grid=6, background_grid=4) + + cached_model = MG94_cacher(code) + + #This is the function that assigns models to branches + #Sometimes there will be the same number of tags as omegas, but sometimes there will be one more omega. + #Depending on whether there are any background branches (UNTESTED WITH NO BACKGROUND BRANCHES) + function N_Omegas_model_func(tags, omega_vec, alpha, nuc_mat, F3x4, code) + models = [cached_model(alpha, alpha * o, nuc_mat, F3x4, genetic_code=code) for o in omega_vec] + return n::FelNode -> [models[model_ind(n.name, tags)]] + end + + #Defines the grid used for inference. + function gridsetup(lb, ub, num_below_one, trin, tr) + step = (trin(1.0) - trin(lb)) / num_below_one + return tr.(trin(lb):step:trin(ub)) + end + tr(x) = 10^x - 0.05 + trinv(x) = log10(x + 0.05) + alphagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + omegagrid = gridsetup(0.01, 13.0, foreground_grid, trinv, tr) + background_omega_grid = gridsetup(0.05, 6.0, background_grid, trinv, tr) #Much coarser, because this isn't a target of inference + length(background_omega_grid) * length(alphagrid) * length(omegagrid)^2 + + num_groups = length(tags) + is_background = maximum([model_ind(n.name, tags) for n in getnodelist(tree)]) > num_groups + tensor_dims = 1 + num_groups + is_background + + function add_to_each_element(vec_of_vec, elems) + return [vcat(v, [e]) for v in vec_of_vec for e in elems] + end + + codon_param_vec = [[a] for a in alphagrid] + param_kinds = ["Alpha"] + for g in 1:num_groups + push!(param_kinds, "OmegaG$(g)") + codon_param_vec = add_to_each_element(codon_param_vec, omegagrid) + end + if is_background + push!(param_kinds, "OmegaBackground") + codon_param_vec = add_to_each_element(codon_param_vec, background_omega_grid) + end + codon_param_vec + + num_sites = tree.message[1].sites + l = length(codon_param_vec) + log_con_lik_matrix = zeros(l, num_sites) + + verbosity > 0 && println("Step 3: Calculating grid of $(length(codon_param_vec))-by-$(tree.message[1].sites) conditional likelihood values (the slowest step). Currently on:") + + + # modify the tree + + # compute recursion G1 + # compute recursion G2 + + # How does the omega influence the propagation? + + pure_subclades = FelNode[] + # Traverses the tree recursively with a dfs whilst pushing roots of pure subclades to list^ + function find_pure_subclades(node) + # Get the index of the node's tag + tag_ind_of_node = model_ind(node.name, tags) + + # If the node is a leaf, it's pure + if length(node.children) == 0 + return true, tag_ind_of_node + end + + children_is_pure_and_tag = [] + pure_children = FelNode[] + + for child in node.children + child_is_pure, tag_ind = find_pure_subclades(child) + if child_is_pure + push!(pure_children, child) + end + push!(children_is_pure_and_tag, (child_is_pure, tag_ind)) + end + + # Get the index of the node's first child's tag + tag_ind_of_first_child = children_is_pure_and_tag[1][2] + + # This is the case where the subclade starting at node is pure + if all(x == (true, tag_ind_of_first_child) for x in children_is_pure_and_tag) + if tag_ind_of_node != tag_ind_of_first_child + # The purity is broken at this node + push!(pure_subclades, node) + return false, tag_ind_of_node + end + # The purity is not broken at this node + return true, tag_ind_of_node + end + + # This is the case where some child has mixed tags or the children are pure with regards to different tags + for pure_child in pure_children + if length(pure_child.children) == 0 + # We don't want to push leaves into pure_subclades + continue + end + push!(pure_subclades, pure_child) + end + return false, tag_ind_of_node + end + + time_elapsed = @elapsed find_pure_subclades(tree) + return pure_subclades +end + + +function difFUBAR_sample(con_lik_matrix, iters; verbosity=1) + verbosity > 0 && println("Step 4: Running Gibbs sampler to infer site categories.") + alloc_grid, theta = LDA_gibbs_track_allocation_vec(con_lik_matrix, 0.1, iters=iters) + return alloc_grid, theta +end + +function difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=1, sites_to_plot=nothing, exports=true) + grid_size, num_sites = size(alloc_grid) + + r(s) = round(s, digits=4) + + detected_sites = Int64[] + group1_volumes = Vector{Float64}[] + group2_volumes = Vector{Float64}[] + alpha_volumes = Vector{Float64}[] + detections = Vector{Float64}[] #legacy name - now includes all 4 "relevant" site posteriors + param_means = Vector{Float64}[] + + ω1 = [c[2] for c in codon_param_vec] + ω2 = [c[3] for c in codon_param_vec] + alphas = [c[1] for c in codon_param_vec] + ω1_greater_filt = ω1 .> ω2 + ω2_greater_filt = ω2 .> ω1 + ω1_pos_filt = ω1 .> 1.0 + ω2_pos_filt = ω2 .> 1.0 + + verbosity > 0 && println("Step 5: Tabulating and plotting. Detected sites:") + for site in 1:num_sites + ω1_greater_posterior = sum(alloc_grid[ω1_greater_filt, site]) / sum(alloc_grid[:, site]) + ω2_greater_posterior = sum(alloc_grid[ω2_greater_filt, site]) / sum(alloc_grid[:, site]) + ω1_pos_posterior = sum(alloc_grid[ω1_pos_filt, site]) / sum(alloc_grid[:, site]) + ω2_pos_posterior = sum(alloc_grid[ω2_pos_filt, site]) / sum(alloc_grid[:, site]) + detecs = [ω1_greater_posterior, ω2_greater_posterior, ω1_pos_posterior, ω2_pos_posterior] + + site_counts_ω1 = collapse_counts(ω1, alloc_grid[:, site], cases=omegagrid) + site_counts_ω2 = collapse_counts(ω2, alloc_grid[:, site], cases=omegagrid) + site_counts_alphas = collapse_counts(alphas, alloc_grid[:, site], cases=alphagrid) + + mean_alpha = sum(site_counts_alphas .* alphagrid) + mean_ω1 = sum(site_counts_ω1 .* omegagrid) + mean_ω2 = sum(site_counts_ω2 .* omegagrid) + + push!(detections, detecs) + push!(param_means, [mean_alpha, mean_ω1, mean_ω2]) + push!(group1_volumes, site_counts_ω1) + push!(group2_volumes, site_counts_ω2) + push!(alpha_volumes, site_counts_alphas) + + if maximum(detecs) > pos_thresh + verbosity > 0 && print("Site $(site) - ") + verbosity > 0 && print("P(ω1 > ω2):", ω1_greater_posterior) + verbosity > 0 && print("; P(ω2 > ω1):", ω2_greater_posterior) + verbosity > 0 && print("; P(ω1 > 1):", ω1_pos_posterior) + verbosity > 0 && println("; P(ω2 > 1):", ω2_pos_posterior) + push!(detected_sites, site) + end + end + + #Exporting site data + df = DataFrame() + df[!, "Codon Sites"] = [1:num_sites;] + df[!, "P(ω1 > ω2)"] = [d[1] for d in detections] + df[!, "P(ω2 > ω1)"] = [d[2] for d in detections] + df[!, "P(ω1 > 1)"] = [d[3] for d in detections] + df[!, "P(ω2 > 1)"] = [d[4] for d in detections] + df[!, "mean(α)"] = [d[1] for d in param_means] + df[!, "mean(ω1)"] = [d[2] for d in param_means] + df[!, "mean(ω2)"] = [d[3] for d in param_means] + + verbosity > 0 && println("\nIf exports = true, writing results for all sites to CSV: " * analysis_name * "_posteriors.csv") + exports && CSV.write(analysis_name * "_posteriors.csv", df) + + sites = [1:num_sites;] + + #Select the sites that will get plotted, in case you want to customize this. + if isnothing(sites_to_plot) + sites_to_plot = detected_sites + end + + if length(sites_to_plot) == 0 + verbosity > 0 && println("No sites detected above threshold.") + elseif exports + verbosity > 0 && println("Plotting alpha and omega distributions. If exports = true, saved as " * analysis_name * "_violin_*.pdf") + + #Assumes alpha and omega grids are the same!? Currently enforced by args passed into difFUBAR_grid + #Maybe this is ok + grd = round.(omegagrid, digits=3) + + #Three plotting examples. + #Plot the alphas for each flagged site + + lmargin = 7 + length(sites_to_plot) / 2 + ysize = 300 + 70 * length(sites[sites_to_plot]) + FUBAR_violin_plot(sites[sites_to_plot], alpha_volumes[sites_to_plot] .* 0.75, grd, tag="α", color="green", x_label="α") + plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + + savefig(analysis_name * "_violin_alpha.pdf") + Plots.CURRENT_PLOT.nullableplot = nothing # PyPlots close() + + #Plot the G1 and G2 omegas + FUBAR_violin_plot(sites[sites_to_plot], group1_volumes[sites_to_plot], grd, tag="ω1", color=tag_colors[1]) + FUBAR_violin_plot(sites[sites_to_plot], group2_volumes[sites_to_plot], grd, tag="ω2", color=tag_colors[2], x_label="ω") + plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + + savefig(analysis_name * "_violin_omegas.pdf") + Plots.CURRENT_PLOT.nullableplot = nothing + + #Plot all three parameters, using the v_offset to separate the alphas from the omegas + FUBAR_violin_plot(sites[sites_to_plot], group1_volumes[sites_to_plot] .* 0.5, grd, tag="ω1", color=tag_colors[1], v_offset=-0.1) + FUBAR_violin_plot(sites[sites_to_plot], group2_volumes[sites_to_plot] .* 0.5, grd, tag="ω2", color=tag_colors[2], v_offset=-0.1) + FUBAR_violin_plot(sites[sites_to_plot], alpha_volumes[sites_to_plot] .* 0.5, grd, tag="α", color="green", v_offset=0.1) + plot!(size=(400, ysize), grid=false, left_margin=(lmargin)mm, bottom_margin=10mm) + + savefig(analysis_name * "_violin_all_params.pdf") + Plots.CURRENT_PLOT.nullableplot = nothing + + #Coerce the violin plot function to also viz the "detection" posteriors. + floored_detec = [clamp.((d .- 0.95) .* 20, 0.0, 1.0) for d in detections[sites_to_plot]] + println(sites_to_plot) + FUBAR_violin_plot(sites[sites_to_plot], [[f[1], 0.0, 0.0, 0.0] for f in floored_detec] .* 0.5, + ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω1>ω2)", color=tag_colors[1], + vertical_ind=nothing, plot_legend=false) + FUBAR_violin_plot(sites[sites_to_plot], [[0.0, f[2], 0.0, 0.0] for f in floored_detec] .* 0.5, + ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω2>ω1)", color=tag_colors[2], + vertical_ind=nothing, plot_legend=false) + FUBAR_violin_plot(sites[sites_to_plot], [[0.0, 0.0, f[3], 0.0] for f in floored_detec] .* 0.5, + ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω1>1)", color=tag_colors[1], + vertical_ind=nothing, plot_legend=false) + FUBAR_violin_plot(sites[sites_to_plot], [[0.0, 0.0, 0.0, f[4]] for f in floored_detec] .* 0.5, + ["P(ω1>ω2)", "P(ω2>ω1)", "P(ω1>1)", "P(ω2>1)"], tag="P(ω2>1)", color=tag_colors[2], + vertical_ind=nothing, legend_ncol=2, x_label="", plot_legend=false) + + lmargin_detect = 12 + length(sites_to_plot) / 2 + + plot!(size=(800, ysize), margins=1Plots.cm, legend=false, grid=false, + ytickfont=18, bottom_margin=30mm, left_margin=(lmargin_detect)mm, + xtickfont=18) + println(length(sites_to_plot)) + + savefig(analysis_name * "_detections.pdf") + Plots.CURRENT_PLOT.nullableplot = nothing + + end + + if exports + Plots.CURRENT_PLOT.nullableplot = nothing + FUBAR_omega_plot(param_means, tag_colors, pos_thresh, detections, num_sites) + plot!(size=(1.25 * length(sites), 300), margins=1Plots.cm, grid=false, legendfontsize=8) + savefig(analysis_name * "_site_omega_means.pdf") + + end + + return df +end +export difFUBAR_tabulate + +#Must return enough to re-calculate detections etc +function difFUBAR(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR + +#Must return enough to re-calculate detections etc +function difFUBAR_prune_max(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid_pruned_1(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR_prune_max + + +#Must return enough to re-calculate detections etc +function difFUBAR_prune_patrick(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid_pruned_2(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR_prune_patrick + From 38c5e6acb6c49ae78e507db9fd6c7ec69775b1d6 Mon Sep 17 00:00:00 2001 From: patruong Date: Mon, 18 Dec 2023 13:08:18 +0100 Subject: [PATCH 03/11] update difFUBAR.jl --- src/difFUBAR/difFUBAR.jl | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/difFUBAR/difFUBAR.jl b/src/difFUBAR/difFUBAR.jl index 75f6206..ba59803 100644 --- a/src/difFUBAR/difFUBAR.jl +++ b/src/difFUBAR/difFUBAR.jl @@ -1529,3 +1529,44 @@ function difFUBAR_prune_patrick(seqnames, seqs, treestring, tags, tag_colors, ou end export difFUBAR_prune_patrick +function difFUBAR_prune_patrick_max(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid_pruned_3(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR_prune_patrick_max + +function difFUBAR_prune_patrick_max_child(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid_pruned_4(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR_prune_patrick_max_child + +function difFUBAR_prune_final(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thresh=0.95, iters=2500, verbosity=1, exports=true, code=MolecularEvolution.universal_code) + analysis_name = outpath + tree, tags, tag_colors, analysis_name = difFUBAR_init(analysis_name, treestring, tags, tag_colors, exports=exports, verbosity=verbosity) + tree, alpha, beta, GTRmat, F3x4_freqs, eq_freqs = difFUBAR_global_fit(seqnames, seqs, tree, generate_tag_stripper(tags), code, verbosity=verbosity) + con_lik_matrix, _, codon_param_vec, alphagrid, omegagrid, _ = difFUBAR_grid_final(tree, tags, GTRmat, F3x4_freqs, code, + verbosity=verbosity, foreground_grid=6, background_grid=4) + alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) + df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + + #Return df, (tuple of partial calculations needed to re-run tablulate) + return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) +end +export difFUBAR_prune_final From 34d52fb9ea37de0a5b82e614bae8462fb441ed14 Mon Sep 17 00:00:00 2001 From: patruong Date: Mon, 18 Dec 2023 16:21:25 +0100 Subject: [PATCH 04/11] update --- src/difFUBAR/difFUBAR.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/difFUBAR/difFUBAR.jl b/src/difFUBAR/difFUBAR.jl index ba59803..83c752e 100644 --- a/src/difFUBAR/difFUBAR.jl +++ b/src/difFUBAR/difFUBAR.jl @@ -1493,6 +1493,10 @@ function difFUBAR(seqnames, seqs, treestring, tags, tag_colors, outpath; pos_thr alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end @@ -1508,6 +1512,10 @@ function difFUBAR_prune_max(seqnames, seqs, treestring, tags, tag_colors, outpat alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end @@ -1524,6 +1532,10 @@ function difFUBAR_prune_patrick(seqnames, seqs, treestring, tags, tag_colors, ou alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end @@ -1538,6 +1550,10 @@ function difFUBAR_prune_patrick_max(seqnames, seqs, treestring, tags, tag_colors alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end @@ -1552,6 +1568,10 @@ function difFUBAR_prune_patrick_max_child(seqnames, seqs, treestring, tags, tag_ alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end @@ -1566,6 +1586,10 @@ function difFUBAR_prune_final(seqnames, seqs, treestring, tags, tag_colors, outp alloc_grid, theta = difFUBAR_sample(con_lik_matrix, iters, verbosity=verbosity) df = difFUBAR_tabulate(analysis_name, pos_thresh, alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors; verbosity=verbosity, exports=exports) + # added for testing + con_lik_df = DataFrame(Tables.table(con_lik_matrix)) + CSV.write(analysis_name * "_con_lik_matrix.csv", con_lik_df, writeheader=false) + #Return df, (tuple of partial calculations needed to re-run tablulate) return df, (alloc_grid, codon_param_vec, alphagrid, omegagrid, tag_colors) end From e57071de05d22cf5c9dde53efb9a7bd1ba6a31f5 Mon Sep 17 00:00:00 2001 From: patruong Date: Thu, 13 Jun 2024 08:30:44 +0000 Subject: [PATCH 05/11] added more functions for importing newick file and recursive retagging --- src/shared/shared.jl | 92 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 85d0dc7..abbd89d 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -100,7 +100,6 @@ function replace_newick_tags(treestr) pattern = r"\{([^}]+)\}" unique_tags = Set{String}() for match in eachmatch(pattern, treestr) - println(match) push!(unique_tags, match.match) end unique_tags = collect(unique_tags) @@ -141,16 +140,93 @@ function generate_hex_colors(num_colors) end -function import_grouped_label_tree(tree_file) +function generate_hex_colors_predefined(num_colors) + + if num_colors < 1 + throw(ArgumentError("num_colors must be a positive integer")) + end + color_palette = ["#0000FF", "#FF0000", "#008000", "#FFFF00", "#800080", "#000080", "#808080"] + + if num_colors <= 7 + return color_palette[1:num_colors] + else + colors = color_palette * (num_colors // 7) + colors += color_palette[1:num_colors%7] + return colors + end +end + + + +function get_group_from_name(input_string) + group_pattern = r"\{(.+?)\}" + match_result = match(group_pattern, input_string) + if match_result != nothing + return match_result.match + else + return "" + end +end + + +function remove_internal_node_tags(tree) + # This function so we dont get double tagging in the recursive_tagging + pattern = r"\{([^}]+)\}" + for node in getnonleaflist(tree) + res = match(pattern, node.name) + if res !== nothing + node.name = replace(node.name, res.match => "") + else + node.name = node.name + end + end +end + + +function recursive_tagging(node) + if isleafnode(node) + return + end + + + left_child = recursive_tagging(node.children[1]) + right_child = recursive_tagging(node.children[2]) + + if get_group_from_name(node.children[1].name) == get_group_from_name(node.children[2].name) + group = get_group_from_name(node.children[1].name) + else + group = "" + end + node.name = node.name * group + +end + + +function remove_groups(input::AbstractString) + result = replace(input, r"\{[^}]*\}" => "") + return result +end + + +function import_grouped_labelled_newick_tree(tree_file::String, recursive_retagging::Bool=false) # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag tree = read_newick_tree(tree_file) - treestring = newick(tree) - treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) - tag_colors = generate_hex_colors(length(original_tags)) + + if recursive_retagging == false + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) + tag_colors = generate_hex_colors_predefined(length(original_tags)) + else + remove_internal_node_tags(tree) + recursive_tagging(tree) + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = CodonMolecularEvolution.replace_newick_tags(treestring) + tag_colors = CodonMolecularEvolution.generate_hex_colors_predefined(length(original_tags)) + end return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors end -export import_grouped_label_tree +export import_grouped_labelled_newick_tree function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) @@ -185,7 +261,6 @@ Optimizes the MG94+F3x4 model on a tree, given a set of sequences and a tree. Re The leaf_name_transform kwarg can be used to transform the leaf names in the tree to match the seqnames. """ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code) - #Count F3x4 frequencies from the seqs, and estimate codon freqs from this f3x4 = MolecularEvolution.count_F3x4(seqs) eq_freqs = MolecularEvolution.F3x4_eq_freqs(f3x4) @@ -220,9 +295,8 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge upper_bounds!(opt, [5.0 for i in 1:num_params]) xtol_rel!(opt, 1e-12) _, mini, _ = NLopt.optimize(opt, flat_initial_params) - final_params = unflatten(mini) - + final_params.rates #tree, alpha, beta, nuc_matrix, F3x4, eq_freqs return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs end From 55311a70716e3aeeb465f6eea3a5d0c477d8c759 Mon Sep 17 00:00:00 2001 From: patruong Date: Thu, 13 Jun 2024 08:50:01 +0000 Subject: [PATCH 06/11] added more functions for recursive retagging --- src/shared/shared.jl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index abbd89d..545055d 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -157,6 +157,16 @@ function generate_hex_colors_predefined(num_colors) end +function get_node_by_name(tree, nodename) + mynode = [] + for node in getnodelist(tree) + if node.name == nodename + push!(mynode, node) + end + end + mynode = mynode[1] + return mynode +end function get_group_from_name(input_string) group_pattern = r"\{(.+?)\}" @@ -229,6 +239,31 @@ end export import_grouped_labelled_newick_tree + +function import_grouped_labelled_newick_tree_reroot(tree_file::String, node_name::String, dist_above_child::Float64, recursive_retagging::Bool=false) + # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag + tree = read_newick_tree(tree_file) + node = get_node_by_name(tree, node_name) + tree = MolecularEvolution.recursive_reroot(node, dist_above_child=dist_above_child) + + if recursive_retagging == false + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) + tag_colors = generate_hex_colors_predefined(length(original_tags)) + else + remove_internal_node_tags(tree) + recursive_tagging(tree) + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = CodonMolecularEvolution.replace_newick_tags(treestring) + tag_colors = CodonMolecularEvolution.generate_hex_colors_predefined(length(original_tags)) + end + return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors +end + +export import_grouped_labelled_newick_tree_reroot + + + function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) # If there are more than 2 tags in the newick tree, we split up the tags to the tags to be analyzed and the tags to be removed (placed as background) analysis_tags = tags[tag_pos] From 80199f037eb95e8676c78f871555eb826be5170c Mon Sep 17 00:00:00 2001 From: patruong Date: Thu, 13 Jun 2024 08:55:30 +0000 Subject: [PATCH 07/11] added more retagging functions --- src/shared/shared.jl | 162 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 130 insertions(+), 32 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 545055d..d413196 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -156,7 +156,6 @@ function generate_hex_colors_predefined(num_colors) end end - function get_node_by_name(tree, nodename) mynode = [] for node in getnodelist(tree) @@ -239,31 +238,6 @@ end export import_grouped_labelled_newick_tree - -function import_grouped_labelled_newick_tree_reroot(tree_file::String, node_name::String, dist_above_child::Float64, recursive_retagging::Bool=false) - # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag - tree = read_newick_tree(tree_file) - node = get_node_by_name(tree, node_name) - tree = MolecularEvolution.recursive_reroot(node, dist_above_child=dist_above_child) - - if recursive_retagging == false - treestring = newick(tree) - treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) - tag_colors = generate_hex_colors_predefined(length(original_tags)) - else - remove_internal_node_tags(tree) - recursive_tagging(tree) - treestring = newick(tree) - treestring_group_labeled, group_tags, original_tags = CodonMolecularEvolution.replace_newick_tags(treestring) - tag_colors = CodonMolecularEvolution.generate_hex_colors_predefined(length(original_tags)) - end - return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors -end - -export import_grouped_labelled_newick_tree_reroot - - - function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) # If there are more than 2 tags in the newick tree, we split up the tags to the tags to be analyzed and the tags to be removed (placed as background) analysis_tags = tags[tag_pos] @@ -296,14 +270,17 @@ Optimizes the MG94+F3x4 model on a tree, given a set of sequences and a tree. Re The leaf_name_transform kwarg can be used to transform the leaf names in the tree to match the seqnames. """ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code) + #Count F3x4 frequencies from the seqs, and estimate codon freqs from this f3x4 = MolecularEvolution.count_F3x4(seqs) eq_freqs = MolecularEvolution.F3x4_eq_freqs(f3x4) #Set up a codon partition (will default to Universal genetic code) - initial_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) - initial_partition.state .= eq_freqs + eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) + eq_partition.state .= eq_freqs + initial_partition = LazyPartition{CodonPartition}(nothing) populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) + lazyprep!(tree, [eq_partition]) #We'll use the empirical F3x4 freqs, fixed MG94 alpha=1, and optimize the nuc parameters and MG94 beta #Note: the nuc rates are confounded with alpha @@ -330,14 +307,136 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge upper_bounds!(opt, [5.0 for i in 1:num_params]) xtol_rel!(opt, 1e-12) _, mini, _ = NLopt.optimize(opt, flat_initial_params) + final_params = unflatten(mini) - final_params.rates + #tree, alpha, beta, nuc_matrix, F3x4, eq_freqs return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs end +function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code, optimize_branch_lengths=false) + #Optimize mu params using a nuc model, mu denotes the nucleotide mutational biases + nuc_pi = char_proportions(seqs, MolecularEvolution.nucstring) + + eq_partition = NucleotidePartition(length(seqs[1])) + eq_partition.state .= nuc_pi + if optimize_branch_lengths == true + populate_tree!(tree, eq_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) + else + initial_partition = LazyPartition{NucleotidePartition}(nothing) + populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) + lazyprep!(tree, [eq_partition]) + end + + initial_params = positive(ones(6)) #rates must be non-negative + + flat_initial_params, unflatten = value_flatten(initial_params) #See ParameterHandling.jl docs + num_params = length(flat_initial_params) + + function build_model_vec(p; nuc_pi=nuc_pi) + return GeneralCTMC(reversibleQ(p, nuc_pi)) + end + + function objective(params; tree=tree) + return -log_likelihood!(tree, build_model_vec(params)) + end + + opt = Opt(:LN_BOBYQA, num_params) + min_objective!(opt, (x, y) -> (objective ∘ unflatten)(x)) + lower_bounds!(opt, [-5.0 for i in 1:num_params]) + upper_bounds!(opt, [5.0 for i in 1:num_params]) + xtol_rel!(opt, 1e-12) + _, mini, _ = NLopt.optimize(opt, flat_initial_params) + + final_params = unflatten(mini) + #tree, nuc mu rates, nuc pi rates + return tree, final_params, nuc_pi +end + +function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code) + #Now we optimize alpha and beta rates using a codon model + #Count F3x4 frequencies from the seqs, and estimate codon freqs from this + f3x4 = MolecularEvolution.count_F3x4(seqs) + eq_freqs = MolecularEvolution.F3x4_eq_freqs(f3x4) + + #Set up a codon partition (will default to Universal genetic code) + eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) + eq_partition.state .= eq_freqs + initial_partition = LazyPartition{CodonPartition}(nothing) + populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) + lazyprep!(tree, [eq_partition]) + + function build_model_vec(alpha, beta; F3x4=f3x4) + #Need to pass through genetic code here! + #If you run into numerical issues with DiagonalizedCTMC, switch to GeneralCTMC instead + return GeneralCTMC(MolecularEvolution.MG94_F3x4(alpha, beta, GTRmat, F3x4, genetic_code=genetic_code)) + end + + function objective(alpha=1, beta=1; tree=tree, eq_freqs=eq_freqs) + return log_likelihood!(tree, build_model_vec(alpha, beta)) + end + + #High tolerance estimates of alpha then beta + alpha, beta = 1.0, 1.0 + num_1d_optims = 3 #We'll do this many 1D optimizations, first for alpha, then beta, then alpha again and so forth + initial_param = 1.0 #rate must be non-negative + lower_bound = 0.0 + upper_bound = 5.0 + high_tol = 1e-4 + for i in 1:num_1d_optims-1 + isodd = i % 2 == 1 + if isodd + alpha = brents_method_minimize(x -> -objective(x, beta), lower_bound, upper_bound, identity, high_tol) + @show Int(ceil(log(high_tol / (upper_bound - lower_bound)) / log(MolecularEvolution.invphi))) + #alpha = golden_section_maximize(x -> objective(x, beta), lower_bound, upper_bound, identity, high_tol) + #@show alpha1, alpha + else + beta = brents_method_minimize(x -> -objective(alpha, x), lower_bound, upper_bound, identity, high_tol) + #beta = golden_section_maximize(x -> objective(alpha, x), lower_bound, upper_bound, identity, high_tol) + #@show beta1, beta + end + end + #Low tolerance final estimate of alpha + """ + the parabola thing is: + take 3 values of the parameter, + and eval the LL. fit the parabola, + and take the max of the parabola, + then swap out the worst LL in the previous set for the new max, + and repeat with those 3 values + // Benjamin Murrell in a teams convo + """ + low_tol = 1e-7 + ε = 1e-2 + prev_alpha = Inf + maxiters = 20 #We should be quite close to the final alpha + iters = 0 + xvec = [alpha - ε, alpha, alpha + ε] + yvec = map(x -> objective(x, beta), xvec) + while abs(alpha - prev_alpha) > low_tol && iters < maxiters + prev_alpha = alpha + if iters > 0 #We use this order to avoid one potential objective call + wipe_index = argmin(yvec) + max_obj = objective(alpha, beta) + xvec[wipe_index] = alpha + yvec[wipe_index] = max_obj + perm = sortperm(xvec) + xvec = xvec[perm] + yvec = yvec[perm] + end + alpha = MolecularEvolution.new_max(xvec, yvec) + iters += 1 + end + #tree, alpha, beta, F3x4, eq_freqs + return tree, alpha, beta, f3x4, eq_freqs +end +function rescale_branchlengths!(tree, scale) + for node in getnodelist(tree) + node.branchlength *= scale + end +end ########################################## #Possibly migrate to MolecularEvolution.jl @@ -411,7 +510,7 @@ function MolecularEvolution.populate_tree!( if init_all_messages internal_message_init!(tree, starting_message) else - tree.parent_message = deepcopy(starting_message) + tree.parent_message = copy_message(starting_message) end name_dic = Dict(zip(names, 1:length(names))) for n in getleaflist(tree) @@ -448,5 +547,4 @@ function MolecularEvolution.populate_tree!( tolerate_missing=tolerate_missing, leaf_name_transform=leaf_name_transform ) -end - +end \ No newline at end of file From 1a08afbedc78e6648ffcb0cecbc49e12cb9125a7 Mon Sep 17 00:00:00 2001 From: patruong Date: Thu, 13 Jun 2024 14:13:23 +0200 Subject: [PATCH 08/11] Add import and retagging functions for Newick tree handling --- src/shared/shared.jl | 93 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 8 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 4c990b1..4d0a4a7 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -140,16 +140,93 @@ function generate_hex_colors(num_colors) end -function import_grouped_label_tree(tree_file) +function generate_hex_colors_predefined(num_colors) + + if num_colors < 1 + throw(ArgumentError("num_colors must be a positive integer")) + end + color_palette = ["#0000FF", "#FF0000", "#008000", "#FFFF00", "#800080", "#000080", "#808080"] + + if num_colors <= 7 + return color_palette[1:num_colors] + else + colors = color_palette * (num_colors // 7) + colors += color_palette[1:num_colors%7] + return colors + end +end + + + +function get_group_from_name(input_string) + group_pattern = r"\{(.+?)\}" + match_result = match(group_pattern, input_string) + if match_result != nothing + return match_result.match + else + return "" + end +end + + +function remove_internal_node_tags(tree) + # This function so we dont get double tagging in the recursive_tagging + pattern = r"\{([^}]+)\}" + for node in getnonleaflist(tree) + res = match(pattern, node.name) + if res !== nothing + node.name = replace(node.name, res.match => "") + else + node.name = node.name + end + end +end + + +function recursive_tagging(node) + if isleafnode(node) + return + end + + + left_child = recursive_tagging(node.children[1]) + right_child = recursive_tagging(node.children[2]) + + if get_group_from_name(node.children[1].name) == get_group_from_name(node.children[2].name) + group = get_group_from_name(node.children[1].name) + else + group = "" + end + node.name = node.name * group + +end + + +function remove_groups(input::AbstractString) + result = replace(input, r"\{[^}]*\}" => "") + return result +end + + +function import_grouped_labelled_newick_tree(tree_file::String, recursive_retagging::Bool=false) # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag tree = read_newick_tree(tree_file) - treestring = newick(tree) - treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) - tag_colors = generate_hex_colors(length(original_tags)) + + if recursive_retagging == false + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) + tag_colors = generate_hex_colors_predefined(length(original_tags)) + else + remove_internal_node_tags(tree) + recursive_tagging(tree) + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = CodonMolecularEvolution.replace_newick_tags(treestring) + tag_colors = CodonMolecularEvolution.generate_hex_colors_predefined(length(original_tags)) + end return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors end -export import_grouped_label_tree +export import_grouped_labelled_newick_tree function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) @@ -228,7 +305,7 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs end -function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code, optimize_branch_lengths = false) +function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code, optimize_branch_lengths=false) #Optimize mu params using a nuc model, mu denotes the nucleotide mutational biases nuc_pi = char_proportions(seqs, MolecularEvolution.nucstring) @@ -289,7 +366,7 @@ function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_t function objective(alpha=1, beta=1; tree=tree, eq_freqs=eq_freqs) return log_likelihood!(tree, build_model_vec(alpha, beta)) end - + #High tolerance estimates of alpha then beta alpha, beta = 1.0, 1.0 num_1d_optims = 3 #We'll do this many 1D optimizations, first for alpha, then beta, then alpha again and so forth @@ -337,7 +414,7 @@ function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_t yvec[wipe_index] = max_obj perm = sortperm(xvec) xvec = xvec[perm] - yvec = yvec[perm] + yvec = yvec[perm] end alpha = MolecularEvolution.new_max(xvec, yvec) iters += 1 From a30df45e674c16e59b8afbae877125873ba5bc44 Mon Sep 17 00:00:00 2001 From: patruong Date: Fri, 14 Jun 2024 07:09:31 +0000 Subject: [PATCH 09/11] more functions for newick read in --- src/shared/shared.jl | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index d413196..7d0e7df 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -238,6 +238,30 @@ end export import_grouped_labelled_newick_tree +function import_grouped_labelled_newick_tree_reroot(tree_file::String, node_name::String, dist_above_child::Float64, recursive_retagging::Bool=false) + # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag + tree = read_newick_tree(tree_file) + + node = get_node_by_name(tree, node_name) + tree = MolecularEvolution.recursive_reroot(node, dist_above_child=dist_above_child) + + if recursive_retagging == false + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = replace_newick_tags(treestring) + tag_colors = generate_hex_colors_predefined(length(original_tags)) + else + remove_internal_node_tags(tree) + recursive_tagging(tree) + treestring = newick(tree) + treestring_group_labeled, group_tags, original_tags = CodonMolecularEvolution.replace_newick_tags(treestring) + tag_colors = CodonMolecularEvolution.generate_hex_colors_predefined(length(original_tags)) + end + return treestring_group_labeled, treestring, group_tags, original_tags, tag_colors +end + +export import_grouped_labelled_newick_tree_reroot + + function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) # If there are more than 2 tags in the newick tree, we split up the tags to the tags to be analyzed and the tags to be removed (placed as background) analysis_tags = tags[tag_pos] @@ -278,7 +302,7 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge #Set up a codon partition (will default to Universal genetic code) eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) eq_partition.state .= eq_freqs - initial_partition = LazyPartition{CodonPartition}(nothing) + initial_partition = LazyPartition{CodonPartition}() populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) lazyprep!(tree, [eq_partition]) @@ -323,7 +347,7 @@ function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genet if optimize_branch_lengths == true populate_tree!(tree, eq_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) else - initial_partition = LazyPartition{NucleotidePartition}(nothing) + initial_partition = LazyPartition{NucleotidePartition}() populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) lazyprep!(tree, [eq_partition]) end @@ -362,7 +386,7 @@ function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_t #Set up a codon partition (will default to Universal genetic code) eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) eq_partition.state .= eq_freqs - initial_partition = LazyPartition{CodonPartition}(nothing) + initial_partition = LazyPartition{CodonPartition}() populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) lazyprep!(tree, [eq_partition]) From b05f0c781be21f81ed17a36573bc7b938bdc05ba Mon Sep 17 00:00:00 2001 From: patruong Date: Fri, 14 Jun 2024 07:15:07 +0000 Subject: [PATCH 10/11] update --- src/shared/shared.jl | 74 -------------------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 0057d5f..7d0e7df 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -156,7 +156,6 @@ function generate_hex_colors_predefined(num_colors) end end -<<<<<<< HEAD function get_node_by_name(tree, nodename) mynode = [] for node in getnodelist(tree) @@ -167,9 +166,6 @@ function get_node_by_name(tree, nodename) mynode = mynode[1] return mynode end -======= - ->>>>>>> 1a08afbedc78e6648ffcb0cecbc49e12cb9125a7 function get_group_from_name(input_string) group_pattern = r"\{(.+?)\}" @@ -242,7 +238,6 @@ end export import_grouped_labelled_newick_tree -<<<<<<< HEAD function import_grouped_labelled_newick_tree_reroot(tree_file::String, node_name::String, dist_above_child::Float64, recursive_retagging::Bool=false) # Takes a Newick tree file and return Newick tree, Newick tree with replaced tags, group tags, original tags, and randomly generated colours for each tag tree = read_newick_tree(tree_file) @@ -267,8 +262,6 @@ end export import_grouped_labelled_newick_tree_reroot -======= ->>>>>>> 1a08afbedc78e6648ffcb0cecbc49e12cb9125a7 function select_analysis_tags_from_newick_tree(tags, tag_colors, tag_pos) # If there are more than 2 tags in the newick tree, we split up the tags to the tags to be analyzed and the tags to be removed (placed as background) analysis_tags = tags[tag_pos] @@ -309,11 +302,7 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge #Set up a codon partition (will default to Universal genetic code) eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) eq_partition.state .= eq_freqs -<<<<<<< HEAD initial_partition = LazyPartition{CodonPartition}() -======= - initial_partition = LazyPartition{CodonPartition}(nothing) ->>>>>>> 1a08afbedc78e6648ffcb0cecbc49e12cb9125a7 populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) lazyprep!(tree, [eq_partition]) @@ -347,7 +336,6 @@ function optimize_MG94_F3x4(seqnames, seqs, tree; leaf_name_transform=x -> x, ge #tree, alpha, beta, nuc_matrix, F3x4, eq_freqs return tree, 1.0, final_params.beta, reversibleQ(final_params.rates, ones(4)), f3x4, eq_freqs -<<<<<<< HEAD end function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code, optimize_branch_lengths=false) @@ -408,68 +396,6 @@ function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_t return GeneralCTMC(MolecularEvolution.MG94_F3x4(alpha, beta, GTRmat, F3x4, genetic_code=genetic_code)) end -======= -end - -function optimize_nuc_mu(seqnames, seqs, tree; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code, optimize_branch_lengths=false) - #Optimize mu params using a nuc model, mu denotes the nucleotide mutational biases - nuc_pi = char_proportions(seqs, MolecularEvolution.nucstring) - - eq_partition = NucleotidePartition(length(seqs[1])) - eq_partition.state .= nuc_pi - if optimize_branch_lengths == true - populate_tree!(tree, eq_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) - else - initial_partition = LazyPartition{NucleotidePartition}(nothing) - populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) - lazyprep!(tree, [eq_partition]) - end - - initial_params = positive(ones(6)) #rates must be non-negative - - flat_initial_params, unflatten = value_flatten(initial_params) #See ParameterHandling.jl docs - num_params = length(flat_initial_params) - - function build_model_vec(p; nuc_pi=nuc_pi) - return GeneralCTMC(reversibleQ(p, nuc_pi)) - end - - function objective(params; tree=tree) - return -log_likelihood!(tree, build_model_vec(params)) - end - - opt = Opt(:LN_BOBYQA, num_params) - min_objective!(opt, (x, y) -> (objective ∘ unflatten)(x)) - lower_bounds!(opt, [-5.0 for i in 1:num_params]) - upper_bounds!(opt, [5.0 for i in 1:num_params]) - xtol_rel!(opt, 1e-12) - _, mini, _ = NLopt.optimize(opt, flat_initial_params) - - final_params = unflatten(mini) - #tree, nuc mu rates, nuc pi rates - return tree, final_params, nuc_pi -end - -function optimize_codon_alpha_and_beta(seqnames, seqs, tree, GTRmat; leaf_name_transform=x -> x, genetic_code=MolecularEvolution.universal_code) - #Now we optimize alpha and beta rates using a codon model - #Count F3x4 frequencies from the seqs, and estimate codon freqs from this - f3x4 = MolecularEvolution.count_F3x4(seqs) - eq_freqs = MolecularEvolution.F3x4_eq_freqs(f3x4) - - #Set up a codon partition (will default to Universal genetic code) - eq_partition = CodonPartition(Int64(length(seqs[1]) / 3), code=genetic_code) - eq_partition.state .= eq_freqs - initial_partition = LazyPartition{CodonPartition}(nothing) - populate_tree!(tree, initial_partition, seqnames, seqs, leaf_name_transform=leaf_name_transform) - lazyprep!(tree, [eq_partition]) - - function build_model_vec(alpha, beta; F3x4=f3x4) - #Need to pass through genetic code here! - #If you run into numerical issues with DiagonalizedCTMC, switch to GeneralCTMC instead - return GeneralCTMC(MolecularEvolution.MG94_F3x4(alpha, beta, GTRmat, F3x4, genetic_code=genetic_code)) - end - ->>>>>>> 1a08afbedc78e6648ffcb0cecbc49e12cb9125a7 function objective(alpha=1, beta=1; tree=tree, eq_freqs=eq_freqs) return log_likelihood!(tree, build_model_vec(alpha, beta)) end From 2553c84355043c06b8823d9563fad837de98d94f Mon Sep 17 00:00:00 2001 From: patruong Date: Fri, 14 Jun 2024 07:26:48 +0000 Subject: [PATCH 11/11] added function for quickly checking rerooting plots --- src/shared/shared.jl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/shared/shared.jl b/src/shared/shared.jl index 7d0e7df..9914be4 100644 --- a/src/shared/shared.jl +++ b/src/shared/shared.jl @@ -282,6 +282,20 @@ end export remove_tags_from_newick_tree +function plot_tree(tree, tags, tag_colors) + ### quick plotting to check rerooting + p = sortperm(tags) + tags, tag_colors = tags[p], tag_colors[p] + push!(tag_colors, "black") + strip_tags_from_name = CodonMolecularEvolution.generate_tag_stripper(tags) + color_dict = Dict(zip(getnodelist(tree), [tag_colors[CodonMolecularEvolution.model_ind(n.name, tags)] for n in getnodelist(tree)])) + label_dict = Dict(zip(getnodelist(tree), [strip_tags_from_name(n.name) for n in getnodelist(tree)])) + + tree_draw(tree, canvas_height=(3 + length(getleaflist(tree)) / 5)cm, + draw_labels=true, dot_color_dict=color_dict, + line_color_dict=color_dict, line_width=0.3, min_dot_size=0.01, + nodelabel_dict=label_dict) +end