From 04e9841244639b9d0178ef831ad4bbcf495eeab4 Mon Sep 17 00:00:00 2001 From: sandra-selfdecode Date: Tue, 11 Nov 2025 14:18:19 -0500 Subject: [PATCH] fix: count kept variants --- pbwt/array.c | 54 ++++++++++++++++++++++++++++++----------------- pbwt/pbwtCore.c | 19 +++++++++++------ pbwt/pbwtHtslib.c | 24 ++++++++++----------- 3 files changed, 59 insertions(+), 38 deletions(-) diff --git a/pbwt/array.c b/pbwt/array.c index 515d133..156aa6f 100644 --- a/pbwt/array.c +++ b/pbwt/array.c @@ -28,7 +28,7 @@ * See header file: array.h (includes lots of macros) * HISTORY: * Last edited: Oct 8 21:56 2014 (rd) - * * Sep 19 15:41 2014 (rd): switch to long indices to avoid overflow + * * Sep 19 15:41 2014 (rd): switch to long indices to avoid overflow * * May 5 10:55 2013 (rd): change RD address to rd@sanger.ac.uk * * Feb 14 11:21 2011 (rd): modified in 2009/10 by RD for stand-alone use * Created: Thu Dec 12 15:43:25 1989 (mieg) @@ -122,13 +122,13 @@ void arrayDestroy (Array a) /**************/ -Array arrayCopy (Array a) +Array arrayCopy (Array a) { Array new ; - if (!arrayExists (a)) + if (!arrayExists (a)) die ("arrayCopy called on bad array %lx", (long unsigned int) a) ; - + new = uArrayCreate (a->dim, a->size) ; memcpy (new->base, a->base, a->dim * a->size) ; new->max = a->max ; @@ -137,7 +137,7 @@ Array arrayCopy (Array a) /******************************/ -void arrayExtend (Array a, long n) +void arrayExtend (Array a, long n) { char *new ; @@ -148,13 +148,30 @@ void arrayExtend (Array a, long n) return ; totalAllocatedMemory -= a->dim * a->size ; - if (a->dim*a->size < 1 << 26) /* 64MB */ + if (a->dim*a->size < 67108864) /* 64MB */ a->dim *= 2 ; else - a->dim += 1024 + ((1 << 26) / a->size) ; + a->dim += 1024 + (67108864 / a->size) ; if (n >= a->dim) a->dim = n + 1 ; + /* Check for integer overflow before allocation */ + /* _mycalloc takes (long number, int size), but calloc expects size_t */ + /* Check if a->dim * a->size would overflow size_t or cause calloc to fail */ + if (a->dim < 0 || a->size <= 0) + die("arrayExtend: invalid dimensions: dim=%ld, size=%d", a->dim, a->size); + /* Check for overflow: if a->dim * a->size would exceed size_t limits */ + /* Use size_t for the check to match what calloc expects */ + if (a->dim > 0) { + size_t dim_size = (size_t)a->dim; + size_t elem_size = (size_t)a->size; + size_t total_size = dim_size * elem_size; + /* Check for multiplication overflow */ + if (elem_size > 0 && total_size / elem_size != dim_size) + die("arrayExtend: size overflow: dim=%ld, size=%d (multiplication overflow)", + a->dim, a->size); + } + totalAllocatedMemory += a->dim * a->size ; new = _mycalloc (a->dim, a->size) ; @@ -211,7 +228,7 @@ BOOL arrayFind(Array a, void *s, long *ip, ArrayOrder *order) int ord ; long i = 0 , j, k ; - if (!arrayExists (a)) + if (!arrayExists (a)) die ("arrayFind called on bad array %lx", (long unsigned int) a) ; j = arrayMax(a) ; @@ -229,7 +246,7 @@ BOOL arrayFind(Array a, void *s, long *ip, ArrayOrder *order) { if (ip) *ip = j ; return FALSE ; } - + if (ord == 0) { if (ip) *ip = j ; return TRUE ; @@ -306,19 +323,19 @@ void arrayCompress(Array a) if (arrayMax(a) < 2) return ; - ab = a->base ; + ab = a->base ; as = a->size ; for (i = 1, j = 0 ; i < arrayMax(a) ; i++) { x = ab + i * as ; y = ab + j * as ; - for (k = a->size ; k-- ;) - if (*x++ != *y++) + for (k = a->size ; k-- ;) + if (*x++ != *y++) goto different ; continue ; - + different: if (i != ++j) { x = ab + i * as ; y = ab + j * as ; - for (k = a->size ; k-- ;) + for (k = a->size ; k-- ;) *y++ = *x++ ; } } @@ -342,7 +359,7 @@ void arrayReport (int j) int i ; Array a ; - fprintf(stderr, "Array report: %d created, %d active, %ld MB allocated\n", + fprintf(stderr, "Array report: %d created, %d active, %ld MB allocated\n", totalNumberCreated, totalNumberActive, totalAllocatedMemory/(1024*1024)) ; if (reportArray) @@ -357,13 +374,13 @@ void arrayReport (int j) /**************/ -void arrayStatus (int *nmadep, int *nusedp, +void arrayStatus (int *nmadep, int *nusedp, long *memAllocp, long *memUsedp) -{ +{ int i ; Array a ; - *nmadep = totalNumberCreated ; + *nmadep = totalNumberCreated ; *nusedp = totalNumberActive ; *memAllocp = totalAllocatedMemory ; *memUsedp = 0 ; @@ -376,4 +393,3 @@ void arrayStatus (int *nmadep, int *nusedp, /************************ end of file ********************************/ /**********************************************************************/ - diff --git a/pbwt/pbwtCore.c b/pbwt/pbwtCore.c index 2c3d2ed..a7e2800 100644 --- a/pbwt/pbwtCore.c +++ b/pbwt/pbwtCore.c @@ -14,7 +14,7 @@ *------------------------------------------------------------------- * Description: core functions for pbwt package * - * This is a slimmed down version of the original pbwt repository + * This is a slimmed down version of the original pbwt repository * with added functionality for getting all matches of a certain length * between target haplotypes and all haplotypes in a reference panel. *------------------------------------------------------------------- @@ -354,22 +354,27 @@ void pbwtCursorToAFend(PbwtCursor * u, PBWT * p) { PBWT * pbwtFilterSites(PBWT * pOld, Array filter) { /* Provide array of 0s and 1s to filter sites from pbwt*/ if (arrayMax(filter) != pOld->N) die("Filter is not the same size as pbwt"); - int i, j, newN; - for (j = 0; j < pOld->N; ++j) newN += * arrp(filter, j, int); + int i, j, newN = 0; + /* Count the number of sites that are kept */ + for (j = 0; j < pOld->N; j++) { + if (* arrp(filter, j, int) == 1) newN++; + } + /* If all sites are kept, return the original pbwt */ if (newN == pOld->N) return pOld; + /* Create a new pbwt with the number of sites that are kept */ PBWT * pNew = pbwtCreate(pOld->M, 0); PbwtCursor * uOld = pbwtCursorCreate(pOld, TRUE, TRUE); PbwtCursor * uNew = pbwtCursorCreate(pNew, TRUE, TRUE); uchar * yseq = myalloc(pNew->M, uchar); pNew->sites = arrayCreate(newN, Site); - - for (j = 0; j < pOld->N; ++j) { + /* Copy the sites that are kept to the new pbwt */ + for (j = 0; j < pOld->N; j++) { if (* arrp(filter, j, int) == 1) { array(pNew->sites, pNew->N++, Site) = * arrp(pOld->sites, j, Site); - for (i = 0; i < pOld->M; ++i) yseq[uOld->a[i]] = uOld->y[i]; - for (i = 0; i < pNew->M; ++i) uNew->y[i] = yseq[uNew->a[i]]; + for (i = 0; i < pOld->M; i++) yseq[uOld->a[i]] = uOld->y[i]; + for (i = 0; i < pNew->M; i++) uNew->y[i] = yseq[uNew->a[i]]; pbwtCursorWriteForwards(uNew); } pbwtCursorForwardsRead(uOld); diff --git a/pbwt/pbwtHtslib.c b/pbwt/pbwtHtslib.c index e8d79b6..9016208 100644 --- a/pbwt/pbwtHtslib.c +++ b/pbwt/pbwtHtslib.c @@ -4,7 +4,7 @@ *------------------------------------------------------------------- * Description: all the pbwt stuff that uses htslib, e.g. reading/writing vcf or bcf files * - * This is a slimmed down version of the original pbwt repository + * This is a slimmed down version of the original pbwt repository * with added functionality for getting all matches of a certain length * between target haplotypes and all haplotypes in a reference panel. *------------------------------------------------------------------- @@ -36,7 +36,7 @@ static int variation (PBWT *p, const char *ref, const char *alt) { static int buflen = 0 ; if (!buf) { buflen = 64 ; buf = myalloc (buflen, char) ; } int var ; - if (strlen (ref) + strlen (alt) + 2 > buflen) + if (strlen (ref) + strlen (alt) + 2 > buflen) { do buflen *= 2 ; while (strlen (ref) + strlen (alt) + 2 > buflen) ; free (buf) ; buf = myalloc (buflen, char) ; } @@ -61,23 +61,23 @@ PBWT *pbwtReadVcfGT (char *filename) { uchar *xMissing = myalloc(p->M+1, uchar) ; xMissing[p->M] = Y_SENTINEL; /* needed for efficient packing */ long nMissing = 0; - int nMissingSites = 0; + int nMissingSites = 0; int mgt_arr = 0, *gt_arr = NULL; - while (bcf_sr_next_line (sr)) + while (bcf_sr_next_line (sr)) { bcf1_t *line = bcf_sr_get_line(sr,0) ; const char* chrom = bcf_seqname(hr,line) ; if (!p->chrom) p->chrom = strdup (chrom) ; else if (strcmp (chrom, p->chrom)) break ; int pos = line->pos + 1 ; // bcf coordinates are 0-based - char *ref, *REF; + char *ref, *REF; ref = REF = strdup(line->d.allele[0]); while ( (*ref = toupper(*ref)) ) ++ref ; // get a copy of GTs int ngt = bcf_get_genotypes(hr, line, >_arr, &mgt_arr) ; if (ngt <= 0) continue ; // it seems that -1 is used if GT is not in the FORMAT - if (ngt != p->M && p->M != 2*ngt) die ("%d != %d GT values at %s:%d - not haploid or diploid?", + if (ngt != p->M && p->M != 2*ngt) die ("%d != %d GT values at %s:%d - not haploid or diploid?", ngt, p->M, chrom, pos) ; memset (xMissing, 0, p->M) ; @@ -102,14 +102,14 @@ PBWT *pbwtReadVcfGT (char *filename) { else { for (i = 0 ; i < p->M ; i++) - { if (gt_arr[i] == bcf_int32_vector_end) + { if (gt_arr[i] == bcf_int32_vector_end) die ("unexpected end of genotype vector in VCF") ; if (gt_arr[i] == bcf_gt_missing) { x[i] = 0 ; /* use ref for now */ xMissing[i] = 1 ; ++nMissing ; } - else + else x[i] = bcf_gt_allele(gt_arr[i]) ; // convert from BCF binary to 0 or 1 } } @@ -121,7 +121,7 @@ PBWT *pbwtReadVcfGT (char *filename) { /* not in the REF/ALT site */ for (i = 1 ; i < n_allele ; i++) { - char *alt, *ALT; + char *alt, *ALT; alt = ALT = no_alt ? "." : strdup(line->d.allele[i]); if (!no_alt) while ( (*alt = toupper(*alt)) ) ++alt; @@ -155,12 +155,12 @@ PBWT *pbwtReadVcfGT (char *filename) { if (gt_arr) free (gt_arr) ; bcf_sr_destroy (sr) ; - free (x) ; pbwtCursorDestroy (u) ; + free (x) ; pbwtCursorDestroy (u) ; free (xMissing) ; - fprintf(logFile, " [pbwt]: Read genotypes from %s with %ld haplotypes and %ld sites on chromosome %s\n", + fprintf(logFile, " [pbwt]: Read genotypes from %s with %d haplotypes and %d sites on chromosome %s\n", filename, p->M, p->N, p->chrom); - if (p->missingOffset) fprintf (logFile, " [pbwt]: %ld missing values at %d sites\n", + if (p->missingOffset) fprintf (logFile, " [pbwt]: %ld missing values at %d sites\n", nMissing, nMissingSites) ; return p ;