ContextSV v1.0.0
Loading...
Searching...
No Matches
cnv_caller.h
Go to the documentation of this file.
1// CNVCaller: Detect CNVs and return the state sequence by SNP position
2// (key = [chromosome, SNP position], value = state)
3
4#ifndef CNV_CALLER_H
5#define CNV_CALLER_H
6
7#include "khmm.h"
8#include "input_data.h"
9#include "sv_types.h"
10#include "sv_object.h"
11#include "utils.h"
12
14#include <string>
15#include <vector>
16#include <unordered_map>
17#include <set>
18// #include <mutex>
19#include <shared_mutex>
20#include <future>
21
23
24using namespace sv_types;
25
26// SNP data is a struct containing vectors used in predicting copy number
27// states. It is sorted by SNP position.
28struct SNPData {
29 std::vector<uint32_t> pos;
30 std::vector<double> pfb;
31 std::vector<double> baf;
32 std::vector<double> log2_cov;
33 std::vector<int> state_sequence; // Predicted copy number states
34 std::vector<bool> is_snp; // True if the position is a SNP
35 double mean_chr_cov = 0; // Mean coverage for the chromosome
36
38 pos({}),\
39 pfb({}), \
40 baf({}), \
41 log2_cov({}), \
42 state_sequence({}), \
43 is_snp({}), \
44 mean_chr_cov(0) {}
45};
46
47// CNVCaller: Detect CNVs and return the state sequence by SNP position
48class CNVCaller {
49 private:
50 std::shared_mutex& shared_mutex;
51
52 void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
53
54 void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
55
56 // Query a region for SNPs and return the SNP data
57 void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
58
59 // Split a region into chunks for parallel processing
60 std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
61
62 public:
63 CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {}
64
65 // Define a map of CNV genotypes by HMM predicted state.
66 // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
67 // Each of the 6 state predictions corresponds to a copy number state
68 // (0=No predicted state)
69 // 0: Unknown (No predicted state)
70 // 1: 1/1 (Two copy loss: homozygous deletion, GT: 1/1 for homozygous variant)
71 // 2: 0/1 (One copy loss: heterozygous deletion, GT: 0/1)
72 // 3: 0/0 (Normal diploid: no copy number change, GT: 0/0 for homozygous reference)
73 // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1 for homozygous variant)
74 // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1)
75 // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1)
76 const std::unordered_map<int, Genotype> StateGenotypeMap = {
77 {0, Genotype::UNKNOWN},
78 {1, Genotype::HOMOZYGOUS_ALT},
79 {2, Genotype::HETEROZYGOUS},
80 {3, Genotype::HOMOZYGOUS_REF},
81 {4, Genotype::HOMOZYGOUS_ALT},
82 {5, Genotype::HETEROZYGOUS},
83 {6, Genotype::HOMOZYGOUS_ALT}
84 };
85
86 // Function to get the genotype string from the state
87 inline Genotype getGenotypeFromCNState(int cn_state) const {
88 // return StateGenotypeMap.at(cn_state);
89 try {
90 return StateGenotypeMap.at(cn_state);
91 } catch (const std::out_of_range& e) {
92 printError("ERROR: Invalid CN state: " + std::to_string(cn_state));
93 return Genotype::UNKNOWN;
94 }
95 }
96
97 // Run copy number prediction for a single SV candidate, returning the
98 // likelihood, predicted CNV type, genotype, and whether SNPs were found
99 std::tuple<double, SVType, Genotype, int> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
100
101 // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
102 void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
103
104 void calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const;
105
106 void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb, const InputData& input_data) const;
107
108 // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
109 void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
110
111 void saveSVCopyNumberToJSON(SNPData& before_sv, SNPData& after_sv, SNPData& snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const;
112};
113
114#endif // CNV_CALLER_H
Definition cnv_caller.h:48
Genotype getGenotypeFromCNState(int cn_state) const
Definition cnv_caller.h:87
void saveSVCopyNumberToTSV(SNPData &snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const
Definition cnv_caller.cpp:820
void saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SNPData &snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string &filepath) const
Definition cnv_caller.cpp:902
void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector< uint32_t > &snp_pos, std::unordered_map< uint32_t, double > &snp_baf, std::unordered_map< uint32_t, double > &snp_pfb, const InputData &input_data) const
Definition cnv_caller.cpp:567
void calculateMeanChromosomeCoverage(const std::vector< std::string > &chromosomes, std::unordered_map< std::string, std::vector< uint32_t > > &chr_pos_depth_map, std::unordered_map< std::string, double > &chr_mean_cov_map, const std::string &bam_filepath, int thread_count) const
Definition cnv_caller.cpp:424
std::tuple< double, SVType, Genotype, int > runCopyNumberPrediction(std::string chr, const CHMM &hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector< uint32_t > &pos_depth_map, const InputData &input_data) const
Definition cnv_caller.cpp:166
CNVCaller(std::shared_mutex &shared_mutex)
Definition cnv_caller.h:63
void runCIGARCopyNumberPrediction(std::string chr, std::vector< SVCall > &sv_candidates, const CHMM &hmm, double mean_chr_cov, const std::vector< uint32_t > &pos_depth_map, const InputData &input_data) const
Definition cnv_caller.cpp:299
const std::unordered_map< int, Genotype > StateGenotypeMap
Definition cnv_caller.h:76
Definition input_data.h:22
Definition sv_types.h:13
Genotype
Definition sv_types.h:50
Definition cnv_caller.h:28
SNPData()
Definition cnv_caller.h:37
std::vector< double > log2_cov
Definition cnv_caller.h:32
std::vector< bool > is_snp
Definition cnv_caller.h:34
std::vector< uint32_t > pos
Definition cnv_caller.h:29
double mean_chr_cov
Definition cnv_caller.h:35
std::vector< double > pfb
Definition cnv_caller.h:30
std::vector< double > baf
Definition cnv_caller.h:31
std::vector< int > state_sequence
Definition cnv_caller.h:33
void printError(std::string message)
Definition utils.cpp:88