Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 35 additions & 19 deletions pbwt/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
* See header file: array.h (includes lots of macros)
* HISTORY:
* Last edited: Oct 8 21:56 2014 (rd)
* * Sep 19 15:41 2014 (rd): switch to long indices to avoid overflow
* * Sep 19 15:41 2014 (rd): switch to long indices to avoid overflow
* * May 5 10:55 2013 (rd): change RD address to rd@sanger.ac.uk
* * Feb 14 11:21 2011 (rd): modified in 2009/10 by RD for stand-alone use
* Created: Thu Dec 12 15:43:25 1989 (mieg)
Expand Down Expand Up @@ -122,13 +122,13 @@ void arrayDestroy (Array a)

/**************/

Array arrayCopy (Array a)
Array arrayCopy (Array a)
{
Array new ;

if (!arrayExists (a))
if (!arrayExists (a))
die ("arrayCopy called on bad array %lx", (long unsigned int) a) ;

new = uArrayCreate (a->dim, a->size) ;
memcpy (new->base, a->base, a->dim * a->size) ;
new->max = a->max ;
Expand All @@ -137,7 +137,7 @@ Array arrayCopy (Array a)

/******************************/

void arrayExtend (Array a, long n)
void arrayExtend (Array a, long n)
{
char *new ;

Expand All @@ -148,13 +148,30 @@ void arrayExtend (Array a, long n)
return ;

totalAllocatedMemory -= a->dim * a->size ;
if (a->dim*a->size < 1 << 26) /* 64MB */
if (a->dim*a->size < 67108864) /* 64MB */
a->dim *= 2 ;
else
a->dim += 1024 + ((1 << 26) / a->size) ;
a->dim += 1024 + (67108864 / a->size) ;
if (n >= a->dim)
a->dim = n + 1 ;

/* Check for integer overflow before allocation */
/* _mycalloc takes (long number, int size), but calloc expects size_t */
/* Check if a->dim * a->size would overflow size_t or cause calloc to fail */
if (a->dim < 0 || a->size <= 0)
die("arrayExtend: invalid dimensions: dim=%ld, size=%d", a->dim, a->size);
/* Check for overflow: if a->dim * a->size would exceed size_t limits */
/* Use size_t for the check to match what calloc expects */
if (a->dim > 0) {
size_t dim_size = (size_t)a->dim;
size_t elem_size = (size_t)a->size;
size_t total_size = dim_size * elem_size;
/* Check for multiplication overflow */
if (elem_size > 0 && total_size / elem_size != dim_size)
die("arrayExtend: size overflow: dim=%ld, size=%d (multiplication overflow)",
a->dim, a->size);
}

totalAllocatedMemory += a->dim * a->size ;

new = _mycalloc (a->dim, a->size) ;
Expand Down Expand Up @@ -211,7 +228,7 @@ BOOL arrayFind(Array a, void *s, long *ip, ArrayOrder *order)
int ord ;
long i = 0 , j, k ;

if (!arrayExists (a))
if (!arrayExists (a))
die ("arrayFind called on bad array %lx", (long unsigned int) a) ;

j = arrayMax(a) ;
Expand All @@ -229,7 +246,7 @@ BOOL arrayFind(Array a, void *s, long *ip, ArrayOrder *order)
{ if (ip) *ip = j ;
return FALSE ;
}

if (ord == 0)
{ if (ip) *ip = j ;
return TRUE ;
Expand Down Expand Up @@ -306,19 +323,19 @@ void arrayCompress(Array a)
if (arrayMax(a) < 2)
return ;

ab = a->base ;
ab = a->base ;
as = a->size ;
for (i = 1, j = 0 ; i < arrayMax(a) ; i++)
{ x = ab + i * as ; y = ab + j * as ;
for (k = a->size ; k-- ;)
if (*x++ != *y++)
for (k = a->size ; k-- ;)
if (*x++ != *y++)
goto different ;
continue ;

different:
if (i != ++j)
{ x = ab + i * as ; y = ab + j * as ;
for (k = a->size ; k-- ;)
for (k = a->size ; k-- ;)
*y++ = *x++ ;
}
}
Expand All @@ -342,7 +359,7 @@ void arrayReport (int j)
int i ;
Array a ;

fprintf(stderr, "Array report: %d created, %d active, %ld MB allocated\n",
fprintf(stderr, "Array report: %d created, %d active, %ld MB allocated\n",
totalNumberCreated, totalNumberActive, totalAllocatedMemory/(1024*1024)) ;

if (reportArray)
Expand All @@ -357,13 +374,13 @@ void arrayReport (int j)

/**************/

void arrayStatus (int *nmadep, int *nusedp,
void arrayStatus (int *nmadep, int *nusedp,
long *memAllocp, long *memUsedp)
{
{
int i ;
Array a ;

*nmadep = totalNumberCreated ;
*nmadep = totalNumberCreated ;
*nusedp = totalNumberActive ;
*memAllocp = totalAllocatedMemory ;
*memUsedp = 0 ;
Expand All @@ -376,4 +393,3 @@ void arrayStatus (int *nmadep, int *nusedp,

/************************ end of file ********************************/
/**********************************************************************/

19 changes: 12 additions & 7 deletions pbwt/pbwtCore.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
*-------------------------------------------------------------------
* Description: core functions for pbwt package
*
* This is a slimmed down version of the original pbwt repository
* This is a slimmed down version of the original pbwt repository
* with added functionality for getting all matches of a certain length
* between target haplotypes and all haplotypes in a reference panel.
*-------------------------------------------------------------------
Expand Down Expand Up @@ -354,22 +354,27 @@ void pbwtCursorToAFend(PbwtCursor * u, PBWT * p) {
PBWT * pbwtFilterSites(PBWT * pOld, Array filter) {
/* Provide array of 0s and 1s to filter sites from pbwt*/
if (arrayMax(filter) != pOld->N) die("Filter is not the same size as pbwt");
int i, j, newN;
for (j = 0; j < pOld->N; ++j) newN += * arrp(filter, j, int);
int i, j, newN = 0;
/* Count the number of sites that are kept */
for (j = 0; j < pOld->N; j++) {
if (* arrp(filter, j, int) == 1) newN++;
}
/* If all sites are kept, return the original pbwt */
if (newN == pOld->N) return pOld;

/* Create a new pbwt with the number of sites that are kept */
PBWT * pNew = pbwtCreate(pOld->M, 0);
PbwtCursor * uOld = pbwtCursorCreate(pOld, TRUE, TRUE);
PbwtCursor * uNew = pbwtCursorCreate(pNew, TRUE, TRUE);
uchar * yseq = myalloc(pNew->M, uchar);

pNew->sites = arrayCreate(newN, Site);

for (j = 0; j < pOld->N; ++j) {
/* Copy the sites that are kept to the new pbwt */
for (j = 0; j < pOld->N; j++) {
if (* arrp(filter, j, int) == 1) {
array(pNew->sites, pNew->N++, Site) = * arrp(pOld->sites, j, Site);
for (i = 0; i < pOld->M; ++i) yseq[uOld->a[i]] = uOld->y[i];
for (i = 0; i < pNew->M; ++i) uNew->y[i] = yseq[uNew->a[i]];
for (i = 0; i < pOld->M; i++) yseq[uOld->a[i]] = uOld->y[i];
for (i = 0; i < pNew->M; i++) uNew->y[i] = yseq[uNew->a[i]];
pbwtCursorWriteForwards(uNew);
}
pbwtCursorForwardsRead(uOld);
Expand Down
24 changes: 12 additions & 12 deletions pbwt/pbwtHtslib.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*-------------------------------------------------------------------
* Description: all the pbwt stuff that uses htslib, e.g. reading/writing vcf or bcf files
*
* This is a slimmed down version of the original pbwt repository
* This is a slimmed down version of the original pbwt repository
* with added functionality for getting all matches of a certain length
* between target haplotypes and all haplotypes in a reference panel.
*-------------------------------------------------------------------
Expand Down Expand Up @@ -36,7 +36,7 @@ static int variation (PBWT *p, const char *ref, const char *alt) {
static int buflen = 0 ;
if (!buf) { buflen = 64 ; buf = myalloc (buflen, char) ; }
int var ;
if (strlen (ref) + strlen (alt) + 2 > buflen)
if (strlen (ref) + strlen (alt) + 2 > buflen)
{ do buflen *= 2 ; while (strlen (ref) + strlen (alt) + 2 > buflen) ;
free (buf) ; buf = myalloc (buflen, char) ;
}
Expand All @@ -61,23 +61,23 @@ PBWT *pbwtReadVcfGT (char *filename) {
uchar *xMissing = myalloc(p->M+1, uchar) ;
xMissing[p->M] = Y_SENTINEL; /* needed for efficient packing */
long nMissing = 0;
int nMissingSites = 0;
int nMissingSites = 0;

int mgt_arr = 0, *gt_arr = NULL;
while (bcf_sr_next_line (sr))
while (bcf_sr_next_line (sr))
{ bcf1_t *line = bcf_sr_get_line(sr,0) ;
const char* chrom = bcf_seqname(hr,line) ;
if (!p->chrom) p->chrom = strdup (chrom) ;
else if (strcmp (chrom, p->chrom)) break ;
int pos = line->pos + 1 ; // bcf coordinates are 0-based
char *ref, *REF;
char *ref, *REF;
ref = REF = strdup(line->d.allele[0]);
while ( (*ref = toupper(*ref)) ) ++ref ;

// get a copy of GTs
int ngt = bcf_get_genotypes(hr, line, &gt_arr, &mgt_arr) ;
if (ngt <= 0) continue ; // it seems that -1 is used if GT is not in the FORMAT
if (ngt != p->M && p->M != 2*ngt) die ("%d != %d GT values at %s:%d - not haploid or diploid?",
if (ngt != p->M && p->M != 2*ngt) die ("%d != %d GT values at %s:%d - not haploid or diploid?",
ngt, p->M, chrom, pos) ;

memset (xMissing, 0, p->M) ;
Expand All @@ -102,14 +102,14 @@ PBWT *pbwtReadVcfGT (char *filename) {
else
{
for (i = 0 ; i < p->M ; i++)
{ if (gt_arr[i] == bcf_int32_vector_end)
{ if (gt_arr[i] == bcf_int32_vector_end)
die ("unexpected end of genotype vector in VCF") ;
if (gt_arr[i] == bcf_gt_missing)
{ x[i] = 0 ; /* use ref for now */
xMissing[i] = 1 ;
++nMissing ;
}
else
else
x[i] = bcf_gt_allele(gt_arr[i]) ; // convert from BCF binary to 0 or 1
}
}
Expand All @@ -121,7 +121,7 @@ PBWT *pbwtReadVcfGT (char *filename) {
/* not in the REF/ALT site */
for (i = 1 ; i < n_allele ; i++)
{
char *alt, *ALT;
char *alt, *ALT;
alt = ALT = no_alt ? "." : strdup(line->d.allele[i]);
if (!no_alt) while ( (*alt = toupper(*alt)) ) ++alt;

Expand Down Expand Up @@ -155,12 +155,12 @@ PBWT *pbwtReadVcfGT (char *filename) {

if (gt_arr) free (gt_arr) ;
bcf_sr_destroy (sr) ;
free (x) ; pbwtCursorDestroy (u) ;
free (x) ; pbwtCursorDestroy (u) ;
free (xMissing) ;

fprintf(logFile, " [pbwt]: Read genotypes from %s with %ld haplotypes and %ld sites on chromosome %s\n",
fprintf(logFile, " [pbwt]: Read genotypes from %s with %d haplotypes and %d sites on chromosome %s\n",
filename, p->M, p->N, p->chrom);
if (p->missingOffset) fprintf (logFile, " [pbwt]: %ld missing values at %d sites\n",
if (p->missingOffset) fprintf (logFile, " [pbwt]: %ld missing values at %d sites\n",
nMissing, nMissingSites) ;

return p ;
Expand Down