Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions concordance/src/checker/checker_parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ void checker::declare_options() {
bpo::options_description opt_input ("Input parameters");
opt_input.add_options()
("input", bpo::value< std::string >(), "File with four columns listing in order: regions frequencies validation and imputed dataset. For genome-wide concordance, add more lines specifying different chromosomes.")
("samples", bpo::value< std::string >(), "List of samples to process, one sample ID per line")
("gt-val", "Uses hard called genotypes rather than phread-scaled likelihoods for the validation dataset, reading them from FORMAT/GT field.")
("samples", bpo::value< std::string >(), "List of samples to process. One sample per line, giving the sample ID as it appears in the imputed VCF. Each row may optionally include a second whitespace-separated column giving the corresponding sample ID in the validation VCF (useful when the same biological sample appears under different IDs, e.g. multiple downsampled replicates of the same truth sample). The second column is optional on a per-row basis; a single file may freely mix rows with and without an alias. When omitted, the validation-side ID is assumed identical to the imputed-side ID.")
("gt-val", "Uses hard called genotypes rather than phred-scaled likelihoods for the validation dataset, reading them from FORMAT/GT field.")
("gt-tar", "Uses FORMAT/GT field to determine the best-guess genotype rather than the FORMAT/GP (default). FORMAT/DS are FORMAT/GP fields are still required for calibration and rsquared calculations.");


Expand Down
11 changes: 10 additions & 1 deletion concordance/src/containers/call_set_header.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,17 @@ class call_set {
int N, L, D;
float T;
bool use_subset_samples;
std::set < std::string > subset_samples_set;
std::vector < std::string > subset_samples;
// Truth-side sample ID per row of subset_samples. When --samples provides
// only one column for a row, the entry equals the imputed-side ID. When
// --samples is not provided, this is populated alongside subset_samples
// with identical values during readData initialisation.
std::vector < std::string > subset_samples_truth;
// For each row i of subset_samples, the column index within the truth-subset
// FORMAT arrays (e.g. dp_arr_t[imputed_to_truth_col[i]],
// pl_arr_t[ploidy_t_record * imputed_to_truth_col[i]]). Multiple imputed
// rows share a column when they alias the same truth sample.
std::vector < int > imputed_to_truth_col;

std::vector < std::string > samples;
std::vector < double > bins;
Expand Down
24 changes: 20 additions & 4 deletions concordance/src/containers/call_set_management.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,30 @@ void call_set::setTargets(std::string fsamples) {
std::string buffer;
input_file fd (fsamples);
std::vector < std::string > tokens;
std::pair<std::set<std::string>::iterator,bool> ret;
// Map imputed-id -> truth-id. Used to detect duplicate imputed rows that
// disagree on the truth-side alias.
std::map < std::string, std::string > imputed_to_truth;

while (getline(fd, buffer))
{
if (stb.split(buffer, tokens) < 1) vrb.error("Empty line found in sample file.");
ret = subset_samples_set.insert(tokens[0]);
if (ret.second) subset_samples.push_back(tokens[0]);
const int n_tok = stb.split(buffer, tokens);
if (n_tok < 1) vrb.error("Empty line found in sample file.");
if (n_tok > 2) vrb.error("Too many columns in sample file (expected 1 or 2): " + buffer);

const std::string & imputed_id = tokens[0];
const std::string & truth_id = (n_tok == 2) ? tokens[1] : tokens[0];

auto ins = imputed_to_truth.insert({imputed_id, truth_id});
if (ins.second)
{
subset_samples.push_back(imputed_id);
subset_samples_truth.push_back(truth_id);
}
else if (ins.first->second != truth_id)
{
vrb.error("Sample [" + imputed_id + "] listed twice in --samples file with conflicting truth-side IDs [" + ins.first->second + "] and [" + truth_id + "]");
}
// else: duplicate row with consistent truth ID -- silently dedupe.
}
vrb.bullet("#samples = " + stb.str(subset_samples.size()));
fd.close();
Expand Down
145 changes: 107 additions & 38 deletions concordance/src/containers/call_set_reading.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,14 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
int n_true_samples, n_esti_samples;
unsigned long int n_variants_all_chromosomes = 0;
//std::vector < int > mappingT, mappingE;
char** isec_samples_names = NULL;
int* truth_imap;
int* estimated_imap;
int n_isec_samples_names = 0;
// Imputed-side: one entry per imputed sample (length N).
char** estimated_subset_names = NULL;
int n_estimated_subset_names = 0;
int* estimated_imap = NULL;
// Truth-side: one entry per *unique* truth alias (length K, possibly < N).
char** truth_subset_names = NULL;
int n_truth_subset_names = 0;
int* truth_imap = NULL;

std::string af_tag = options["af-tag"].as<std::string>();
const bool gt_validation = options.count("gt-val");
Expand Down Expand Up @@ -107,41 +111,81 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
if (f == 0)
{
std::unordered_set<std::string> truth_samples_names_set;
std::unordered_set<std::string> isec_samples_names_set;
std::unordered_set<std::string> esti_samples_names_set;

int n_true_samples_file = bcf_hdr_nsamples(sr->readers[0].header);
int n_esti_samples_file = bcf_hdr_nsamples(sr->readers[1].header);
for (int i = 0 ; i < n_true_samples_file ; i ++) truth_samples_names_set.insert(sr->readers[0].header->samples[i]);
for (int i = 0 ; i < n_esti_samples_file ; i ++)
if (truth_samples_names_set.find(sr->readers[1].header->samples[i]) != truth_samples_names_set.end())
isec_samples_names_set.insert(sr->readers[1].header->samples[i]);
for (int i = 0 ; i < n_esti_samples_file ; i ++) esti_samples_names_set.insert(sr->readers[1].header->samples[i]);

// No --samples: take the imputed∩truth intersection 1:1 (truth ID == imputed ID for every row).
if (!options.count("samples")) {
for (int i = 0 ; i < n_esti_samples_file ; i ++) {
const std::string esti_id = sr->readers[1].header->samples[i];
if (truth_samples_names_set.count(esti_id)) {
subset_samples.push_back(esti_id);
subset_samples_truth.push_back(esti_id);
}
}
}

if (!options.count("samples")) subset_samples.assign(isec_samples_names_set.begin(),isec_samples_names_set.end());
for (int i = 0; i < subset_samples.size() ; i ++)
{
if (isec_samples_names_set.find(subset_samples[i]) != isec_samples_names_set.end())
{
isec_samples_names = (char**) realloc(isec_samples_names, (n_isec_samples_names+1)*sizeof(const char*));
isec_samples_names[n_isec_samples_names] = strdup(subset_samples[i].c_str());
n_isec_samples_names++;
// Filter rows whose imputed ID is missing from the imputed VCF or whose
// (possibly aliased) truth ID is missing from the truth VCF.
std::vector<std::string> kept_imputed, kept_truth;
for (size_t i = 0; i < subset_samples.size(); i++) {
if (esti_samples_names_set.count(subset_samples[i]) &&
truth_samples_names_set.count(subset_samples_truth[i])) {
kept_imputed.push_back(subset_samples[i]);
kept_truth.push_back(subset_samples_truth[i]);
}
}
if (n_isec_samples_names == 0) vrb.error("No sample in common between datasets");
subset_samples.swap(kept_imputed);
subset_samples_truth.swap(kept_truth);
if (subset_samples.empty()) vrb.error("No sample in common between datasets");

// Build the estimated-side name array: one entry per imputed row (unique by construction).
n_estimated_subset_names = subset_samples.size();
estimated_subset_names = (char**) malloc(n_estimated_subset_names * sizeof(char*));
for (int i = 0; i < n_estimated_subset_names; i++)
estimated_subset_names[i] = strdup(subset_samples[i].c_str());

// Build the truth-side unique-name array and the imputed -> truth-column mapping.
std::unordered_map<std::string,int> truth_alias_to_col;
imputed_to_truth_col.clear();
imputed_to_truth_col.reserve(subset_samples_truth.size());
for (size_t i = 0; i < subset_samples_truth.size(); i++) {
const std::string & tid = subset_samples_truth[i];
auto it = truth_alias_to_col.find(tid);
if (it == truth_alias_to_col.end()) {
int col = (int) truth_alias_to_col.size();
truth_alias_to_col[tid] = col;
imputed_to_truth_col.push_back(col);
} else {
imputed_to_truth_col.push_back(it->second);
}
}
n_truth_subset_names = (int) truth_alias_to_col.size();
truth_subset_names = (char**) malloc(n_truth_subset_names * sizeof(char*));
for (auto & kv : truth_alias_to_col)
truth_subset_names[kv.second] = strdup(kv.first.c_str());

truth_imap = (int*)malloc(n_isec_samples_names * sizeof(int));
estimated_imap = (int*)malloc(n_isec_samples_names * sizeof(int));
truth_imap = (int*) malloc(n_truth_subset_names * sizeof(int));
estimated_imap = (int*) malloc(n_estimated_subset_names * sizeof(int));

hdr_truth = bcf_hdr_subset(sr->readers[0].header, n_isec_samples_names, isec_samples_names, truth_imap);
hdr_estimated = bcf_hdr_subset(sr->readers[1].header, n_isec_samples_names, isec_samples_names, estimated_imap);
hdr_truth = bcf_hdr_subset(sr->readers[0].header, n_truth_subset_names, truth_subset_names, truth_imap);
hdr_estimated = bcf_hdr_subset(sr->readers[1].header, n_estimated_subset_names, estimated_subset_names, estimated_imap);

if ( !hdr_truth || !hdr_estimated || bcf_hdr_nsamples(hdr_truth) != bcf_hdr_nsamples(hdr_estimated) || bcf_hdr_nsamples(hdr_truth)<=0) vrb.error("Error occurred while subsetting truth samples");
if (!hdr_truth || !hdr_estimated
|| bcf_hdr_nsamples(hdr_truth) != n_truth_subset_names
|| bcf_hdr_nsamples(hdr_estimated) != n_estimated_subset_names)
vrb.error("Error occurred while subsetting samples");

n_true_samples = bcf_hdr_nsamples(hdr_truth);
n_esti_samples = bcf_hdr_nsamples(hdr_estimated);
n_true_samples = bcf_hdr_nsamples(hdr_truth); // = K (unique truth IDs)
n_esti_samples = bcf_hdr_nsamples(hdr_estimated); // = N (imputed rows)

N=n_true_samples;
N=n_esti_samples;
samples.reserve(N);
for (int i=0; i<N;++i) samples.push_back(std::string(hdr_truth->samples[i]));
for (int i=0; i<N;++i) samples.push_back(std::string(hdr_estimated->samples[i]));

/*
////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -200,7 +244,7 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
int n_gt_fields = 0;

bcf1_t * line = bcf_sr_get_line(sr, 1);
bcf_subset(hdr_estimated, line, n_isec_samples_names, estimated_imap);
bcf_subset(hdr_estimated, line, n_estimated_subset_names, estimated_imap);
int ngt = bcf_get_genotypes(hdr_estimated, line, &gt_fields, &n_gt_fields);
const int line_max_ploidy = ngt/n_esti_samples;
assert(line_max_ploidy==ploidy); //we do not allow missing data
Expand Down Expand Up @@ -342,8 +386,12 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
}
else
{
hdr_truth = bcf_hdr_subset(sr->readers[0].header, n_isec_samples_names, isec_samples_names, truth_imap);
hdr_estimated = bcf_hdr_subset(sr->readers[1].header, n_isec_samples_names, isec_samples_names, estimated_imap);
hdr_truth = bcf_hdr_subset(sr->readers[0].header, n_truth_subset_names, truth_subset_names, truth_imap);
hdr_estimated = bcf_hdr_subset(sr->readers[1].header, n_estimated_subset_names, estimated_subset_names, estimated_imap);
if (!hdr_truth || !hdr_estimated
|| bcf_hdr_nsamples(hdr_truth) != n_truth_subset_names
|| bcf_hdr_nsamples(hdr_estimated) != n_estimated_subset_names)
vrb.error("Error occurred while subsetting samples in [" + ftruth[f] + "] / [" + festimated[f] + "] (a sample present in the first input file is missing here)");
}

const int ploidyP1 = ploidy + 1;
Expand Down Expand Up @@ -434,8 +482,8 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
line_t = bcf_sr_get_line(sr, 0);
line_e = bcf_sr_get_line(sr, 1);

bcf_subset(hdr_truth, line_t, n_isec_samples_names, truth_imap);
bcf_subset(hdr_estimated, line_e, n_isec_samples_names, estimated_imap);
bcf_subset(hdr_truth, line_t, n_truth_subset_names, truth_imap);
bcf_subset(hdr_estimated, line_e, n_estimated_subset_names, estimated_imap);

naf_f = bcf_get_info_float(sr->readers[2].header, line_f, af_tag.c_str(), &af_ptr, &naf_arr_f);

Expand Down Expand Up @@ -480,6 +528,22 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
const int ploidy_e_record=ngp_e/n_esti_samples;
const int ploidy_t_record=npl_t/n_true_samples;

// The truth-side per-sample stride must match the imputed-side
// ploidy: ploidy entries per sample under --gt-val (one allele each),
// otherwise ploidy+1 PL entries per sample (one per genotype). When
// these disagree the per-record reads below would silently walk into
// the next sample's slot. This is reachable when --samples aliases
// an imputed sample to a validation sample of different ploidy.
const int expected_t_stride = gt_validation ? ploidy : ploidyP1;
if (ploidy_t_record != expected_t_stride)
vrb.error("Validation and imputed VCFs disagree on ploidy at "
+ std::string(bcf_hdr_id2name(hdr_truth, line_t->rid))
+ ":" + stb.str(line_t->pos + 1)
+ " (imputed ploidy=" + stb.str(ploidy)
+ ", validation per-sample stride=" + stb.str(ploidy_t_record)
+ ", expected " + stb.str(expected_t_stride)
+ "). Aliased samples in --samples must share ploidy with their validation-side counterparts.");

const bool flip = use_alt_af? false : (af > 0.5f);
const float maf = use_alt_af? af : std::min(af, 1.0f - af);
int grp_bin = -1;
Expand Down Expand Up @@ -515,10 +579,13 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
if (grp_bin >= 0)
{ // Do this variant fall within a given frequency bin?

// Read Truth
for(int i = 0 ; i < n_true_samples ; i ++)
// Iterate over imputed rows (N) rather than truth columns (K):
// when K < N, multiple imputed rows alias the same truth column
// and each needs its own per-row write into GLs/DPs below.
for(int i = 0 ; i < N ; i ++)
{
const int index = ploidy_t_record*i;//ploidy
const int t_col = imputed_to_truth_col[i];
const int index = ploidy_t_record*t_col;
const int gpos=ind2gpos[i];

if (gt_validation)
Expand All @@ -540,7 +607,7 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
GLs[gpos+1] = (gt == 1);
GLs[gpos+ploidy] = gt == ploidy;
}
DPs[i] = D > 0 ? dp_arr_t[i] : 0;
DPs[i] = D > 0 ? dp_arr_t[t_col] : 0;
}
else
{
Expand All @@ -558,7 +625,7 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
GLs[gpos+1] = unphred[std::min(pl_arr_t[index+1],255)];
GLs[gpos+ploidy] = unphred[std::min(pl_arr_t[index+ploidy],255)];
}
DPs[i] = D > 0 ? dp_arr_t[i] : 0;
DPs[i] = D > 0 ? dp_arr_t[t_col] : 0;
}
if (flip) { float_swap = GLs[gpos+ploidy]; GLs[gpos+ploidy] = GLs[gpos+0]; GLs[gpos+0] = float_swap; }
}
Expand Down Expand Up @@ -898,8 +965,10 @@ void call_set::readData(std::vector < std::string > & ftruth, std::vector < std:
}
free(truth_imap);
free(estimated_imap);
for(int i = 0; i < n_isec_samples_names; i++) free(isec_samples_names[i]);
if (isec_samples_names) free(isec_samples_names);
for (int i = 0; i < n_estimated_subset_names; i++) free(estimated_subset_names[i]);
if (estimated_subset_names) free(estimated_subset_names);
for (int i = 0; i < n_truth_subset_names; i++) free(truth_subset_names[i]);
if (truth_subset_names) free(truth_subset_names);

vrb.print("");
vrb.bullet("Total #variants = " + stb.str(n_variants_all_chromosomes));
Expand Down
21 changes: 19 additions & 2 deletions docs/docs/documentation/concordance.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,23 @@ GLIMPSE2_concordance --gt-val --ac-bins 1 5 10 20 50 100 200 500 1000 2000 5000
```
</div>

#### Sample aliasing

By default, samples are matched between the imputed VCF and the validation VCF by exact ID. When the same biological sample appears under different IDs in the two files — for example, the same truth sample imputed at multiple downsampling depths — pass a `--samples` file with an optional second column that maps each imputed-side ID to the corresponding truth-side ID. Rows without a second column behave as before.

<div class="code-example" markdown="1">
```
# samples.txt — first column = imputed ID, optional second column = validation ID
NA12878.0_5x NA12878
NA12878.1x NA12878
NA12878.2x NA12878
HG00096
HG00097
```
</div>

The three NA12878 rows alias to the single truth sample `NA12878`; `HG00096` and `HG00097` use the same ID in both files and need no alias. Per-sample output rows are labelled with the imputed-side ID.

---

### Command line options
Expand All @@ -46,8 +63,8 @@ GLIMPSE2_concordance --gt-val --ac-bins 1 5 10 20 50 100 200 500 1000 2000 5000
| Option name | Argument| Default | Description |
|:---------------------|:--------|:---------|:-------------------------------------|
| \-\-input | FILE | NA | File with four columns listing in order: regions frequencies validation and imputed dataset. For genome-wide concordance, add more lines specifying different chromosomes. |
| \-\-samples | NA | NA | List of samples to process, one sample ID per line. |
| \-\-gt-val | NA | NA | Uses hard called genotypes rather than phread-scaled likelihoods for the validation dataset, reading them from FORMAT/GT field. |
| \-\-samples | FILE | NA | List of samples to process. One sample per line, giving the imputed-VCF sample ID; an optional second whitespace-separated column gives the corresponding validation-VCF sample ID. See [Sample aliasing](#sample-aliasing) above for the full description and an example. |
| \-\-gt-val | NA | NA | Uses hard called genotypes rather than phred-scaled likelihoods for the validation dataset, reading them from FORMAT/GT field. |
| \-\-gt-tar | NA | NA | Uses FORMAT/GT field to determine the best-guess genotype rather than the FORMAT/GP (default). FORMAT/DS are FORMAT/GP fields are still required for calibration and rsquared calculations. |


Expand Down