Skip to content

Commit

Permalink
Fix CRAM embed_ref=2 with seqs overlapping ref end.
Browse files Browse the repository at this point in the history
If the sequences align off the end of the reference and we are
creating consensus on the fly, then the consensus generated also steps
beyond the reference length.  Although this longer reference is
embedded, it is trimmed back by the CRAM decoder which validates
against the declared reference length in SQ LN, leading to Ns
appearing in the decoder.

Therefore we now validate in the encoder too, which also needed
refs_from_header updates to parse the LN tag so the encoder can trim.
Note we already overloaded r->length==0 for an indication that we've
not parsed the fa/fai file yet, so we can't just naively fill this out
from the SQ LN header.  We could hold this information elsewhere via a
proper flag and modify all the places that utilise that knowledge, but
the simplest (and safest) fix is to have a separate variable used for
this one specific case.

An example of failure could be seen in:

    ./test/test_view -C -o embed_ref=2 test/c1#bounds.sam |\
    ./test/test_view -
  • Loading branch information
jkbonfield committed Oct 9, 2024
1 parent 2ff207b commit 65a2687
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 5 deletions.
4 changes: 3 additions & 1 deletion cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1761,7 +1761,6 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
c->ref_start = ref_start+1;
c->ref_end = ref_end+1;
c->ref_free = 1;

return 0;

err:
Expand Down Expand Up @@ -1997,6 +1996,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
fd->no_ref_counter -= (fd->no_ref_counter > 0);
pthread_mutex_unlock(&fd->ref_lock);
}

if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
}

// Iterate through records creating the cram blocks for some
Expand Down
8 changes: 5 additions & 3 deletions cram/cram_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2787,10 +2787,12 @@ static int refs_from_header(cram_fd *fd) {

/* Initialise likely filename if known */
if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) {
if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
//fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
}

if ((tag = sam_hrecs_find_key(ty, "LN", NULL)))
// LN tag used when constructing consensus reference
r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
}

k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
Expand Down
3 changes: 2 additions & 1 deletion cram/cram_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,8 @@ struct cram_slice {
typedef struct ref_entry {
char *name;
char *fn;
int64_t length;
int64_t length; // if 0 this indicates we haven't loaded it yet
int64_t LN_length; // @SQ LN length, used to trim consensus ref
int64_t offset;
int bases_per_line;
int line_length;
Expand Down
8 changes: 8 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,14 @@ sub test_view
testv $opts, "./test_view $tv_args -C -p $ercram $ersam";
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

$ersam = "c1#bounds.sam";
$ercram = "c1#bounds_er.tmp.cram";
$ersam2 = "${ercram}.sam";
testv $opts, "./test_view $tv_args -C -p $ercram $ersam";
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

if ($test_view_failures == 0) {
passed($opts, "embed_ref=2 tests");
} else {
Expand Down

0 comments on commit 65a2687

Please sign in to comment.