diff --git a/.gitignore b/.gitignore
index f5ea8b8..dbce267 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,162 @@
-*.pyc
-*.pdf
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/ + .DS_Store diff --git a/PIPE-CLIP_manual.pdf b/PIPE-CLIP_manual.pdf deleted file mode 100644 index 3a32a72..0000000 Binary files a/PIPE-CLIP_manual.pdf and /dev/null differ diff --git a/Python.gitignore b/Python.gitignore new file mode 100644 index 0000000..68bc17f --- /dev/null +++ b/Python.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
diff --git a/README.md b/README.md
index 64b0060..19e6813 100644
--- a/README.md
+++ b/README.md
@@ -1,38 +1,48 @@
-PIPE-CLIP
-=========
-This is version 1.1.0
+# PIPE-CLIP
+
+[![Pypi Releases](https://img.shields.io/pypi/v/pipeclip.svg)](https://pypi.python.org/pypi/pipeclip)
+[![Downloads](https://pepy.tech/badge/pipeclip)](https://pepy.tech/project/pipeclip)
+
 Pipeline for CLIP-seq Analysis.
-- Galaxy site: the galaxy service is discontinued in 2019.
-- Publication: http://genomebiology.com/2014/15/1/R18
-- Google Code site: https://code.google.com/p/pipe-clip/
-
-
-Requirement:
-- Python 2.7;
-- R 3.0 and above;
-- Perl 5 and above;
-- Python packages: pysam, pybedtools and ghmm;
-- R packages: MASS,VGAM and their dependencies.
-- Other packages: HOMER and annotation files
-
-
-Installation tips:
-- Make sure HOMER are in your PATH. You can test this by type "annotatePeaks.pl" from anywhere and you should get help information of this command.
-
-How to use:
-
-- After unzip the package, you cd into the program folder and run PIPE-CLIP by typing:
-- python pipeclip.py -i input.bam -o output_prefix -c CLIP_type -l minimum_matchlength -m maximum_mismatchcount -r Remove_PCR_duplicate -M FDR_for_mutations -C FDR_for_clusters -s species
-
-- -i input BAM
-- -o output prefix
-- -c CLIP type,[0,1,2,3] (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP
-- -l minimum match length
-- -m maximum mismatch count
-- -r method to remove PCR duplicate,[0,1,2] (0)No removal; (1)Remove by read start; (2)Remove by sequence
-- -M FDR to get significant mutations
-- -C FDR to get enriched clusters
-- -s species
-Here, species might be hg19.
-
-Contact: Zhiqun.Xie@UTSouthwestern.edu
+This is a forked version (2.0.x) of [PIPE-CLIP](https://github.com/QBRC/PIPE-CLIP).
+
+## Requirements:
+
+- Python >=3.6;
+- Python packages: `pysam`, `pybedtools` and `rpy2`. (Python packages will be installed automatically)
+
+- R >=3.0;
+- R packages: `MASS`, `VGAM` and their dependencies. (R packages will be installed automatically)
+
+- Perl >=5.0;
+- Other packages: `HOMER` (annotatePeaks.pl) and annotation files.
+  - Make sure HOMER is in your PATH. You can test this by typing `annotatePeaks.pl` from anywhere; you should see the help information for this command.
+
+## How to install:
+
+```bash
+pip install pipeclip
+```
+
+## How to use:
+
+```bash
+pipeclip -i input.bam -o output_prefix -c CLIP_type -l minimum_matchlength -m maximum_mismatchcount -r Remove_PCR_duplicate -M FDR_for_mutations -C FDR_for_clusters -s species
+```
+
+- `-i` input BAM
+- `-t` control BAM
+- `-o` output prefix
+- `-c` CLIP type [0,1,2,3]: (0) HITS-CLIP; (1) PAR-4SU; (2) PAR-6SG; (3) iCLIP
+- `-r` method to remove PCR duplicates [0,1,2]: (0) no removal; (1) remove by read start; (2) remove by sequence
+- `-l` minimum match length
+- `-m` maximum mismatch count
+- `-M` FDR cutoff for significant mutations
+- `-C` FDR cutoff for enriched clusters
+- `-s` species (e.g. hg19, mm10 or mm9); leave blank to skip the annotation step
+
+## Footnote
+
+> Cite:
+>
+> - Chen, B., Yun, J., Kim, M.S. et al. PIPE-CLIP: a comprehensive online tool for CLIP-seq data analysis. Genome Biol 15, R18 (2014).
https://doi.org/10.1186/gb-2014-15-1-r18 diff --git a/lib/Alignment.py b/lib/Alignment.py deleted file mode 100644 index 4b1336f..0000000 --- a/lib/Alignment.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/python -# programmer: beibei.chen@utsouthwestern.edu -# Usage: definition of alignment files including BED and BAM - -import gzip -import Enrich - -class BED(): - def __init__(self,chr,start,stop,name,score,strand): - self.chr = chr - self.start = int(start) - self.stop = int(stop) - self.name = name - self.score = int(score) - self.strand = strand - - def __str__(self): - st = "\t".join([str(self.chr),str(self.start),str(self.stop),self.name,str(self.score),self.strand]) - return st - - def merge(self,read): - self.stop = read.stop - self.score += 1 - - def overlap(self,read): - if self.chr == read.chr and self.strand == read.strand: - if self.start <= read.stop and self.stop >=read.start: - return True - else: - return False - - def updateScore(self,s): - self.score = int(s) - - def increaseScore(self): - self.score += 1 - -class ClusterBed(BED): - def __init__(self,chr,start,stop,name,score,strand): - self.pvalue = 0 - self.qvalue = 0 - self.sig = False - BED.__init__(self,chr,start,stop,name,score,strand) - -class MutationBed(BED): - def __init__(self,chr,start,stop,name,score,strand,type): - self.type = type#insertion,deletion,type of substitution - self.kvalue = 0 - BED.__init__(self,chr,start,stop,name,score,strand) - self.pvalue = 0 - self.qvalue = 0 - self.sig = False - - def updateK(self,k): - self.kvalue = k - - def __str__(self): - st = BED.__str__(self) - st += "\t"+self.type+"\t"+str(self.kvalue) - return st - -class CrosslinkingBed(BED): - def __init__(self,chr,start,stop,name,score,strand,clusterP,clusterQ,mStart,mName,mP): - self.fisherP = 0 - self.clusterP = float(clusterP) - self.mutationP = [mP] - self.qvalue = float(clusterQ) - self.mutationStarts = [str(mStart)] - self.mutationNames = [mName] - BED.__init__(self,chr,start,stop,name,score,strand) - - def addMutation(self,mu):#mu is a instance - self.mutationNames.append(mu.name) - self.mutationStarts.append(str(mu.start)) - self.mutationP.append(mu.pvalue) - - def fishertest(self): - fp = Enrich.fisherTest(self.clusterP,self.mutationP) - self.fisherP = fp - - -class wiglist: - def __init__(self): - self.pos = [] - self.value = [] - - def valueByPos(self,p): - try: - return self.value[self.pos.index(p)] - except: - return False - - def update(self,p,v): - self.pos.append(p) - self.value.append(v) - diff --git a/lib/CLIP.py b/lib/CLIP.py deleted file mode 100644 index b49327f..0000000 --- a/lib/CLIP.py +++ /dev/null @@ -1,568 +0,0 @@ -'''Define CLIP class''' -import sys -import gzip -import copy -import logging -import math -import pysam -import random -import Utils -import Alignment -import Mutation2 -import OptValidator -import subprocess -import datetime -import time -import gc -import os - -gc.enable() -OptValidator.opt_validate() - - -class CLIP: - def __init__(self,fileaddr,prefix): - self.filepath = fileaddr - self.originalBAM = None - self.originalMapped = 0 - self.refInfo = [] - self.posfilteredBAM = None - self.negfilteredBAM = None - self.filteredAlignment = 0 - self.type = 0 - self.outprefix = prefix - self.currentGroupKey = "None" - self.currentGroup = [] #Used in rmdup - self.previousQul = [0,0,0]#for rmdup,[matchlen,mapq,mismatch] - self.clusters = [] - self.clusters_plus = [] - self.clusters_minus = [] - self.clusterCount = 0 - self.currentCluster_plus = Alignment.BED("",0,0,"",0,"+") - 
self.currentCluster_minus = Alignment.BED("",0,0,"",0,"-") - #self.sigClusters = {} - self.mutations = {} #Dictionary of bed instance - self.mutationCount = 0 - self.mutationList = [] - self.sigMutations = {}#same as sigClusters - self.kmpair = {} - self.sigMutationCount = 0 - self.sigClusterCount = 0 - self.coverage = 0 #"reads coverage of this sample" - self.bamheader = None - self.crosslinking = {} - self.crosslinkingMutations = [] - - def __str__(self): - pass - - def testInput(self): - '''Test the input file format, modify self.filepath and return bool - Status: True:file is ready to use; False: wrong file, program stop - ''' - #test if file has header - try: - self.header = pysam.view("-H",self.filepath) - except: - try: - self.header = pysam.view("-SH",self.filepath) - except: - logging.error("Input file does not have header, please check your file. Program quit") - return (False,"None") - #Header test passed, test if it is BAM - try: - infile = gzip.open(self.filepath) - infile.readline(10) - except:#cannot read line, should be sam - logging.info("Input is SAM, converting to BAM...") - bamout = ".".join(self.filepath.split(".")[0:-1])+"."+"bam" - infile = pysam.Samfile(self.filepath,"r",header=self.header) - #print >> sys.stderr,pysam.view("-SH",infile) - outfile = pysam.Samfile(bamout,"wb",template=infile) - for i in infile.fetch(): - outfile.write(i) - self.filepath = bamout - #Now the infile is BAM,check if it is sorted - if Utils.is_sorted(self.header): - pysam.index(self.filepath) - return True - else:#sort the BAM - logging.info("Input is not sorted, sorting file...") - bamsort = ".".join(self.filepath.split(".")[0:-1])+"."+"sort" - pysam.sort(self.filepath,bamsort) - pysam.index(bamsort+".bam") - self.filepath = bamsort+".bam" # change input file path - self.header = pysam.view("-H",bamsort+".bam") - logging.info("Input file sorted") - #if Utils.is_sorted(self.header): - # print >> sys.stderr, "The file is sorted" - return True - - def readfile(self): - try: - self.originalBAM = pysam.Samfile(self.filepath,"rb") - self.refInfo = zip(self.originalBAM.references,self.originalBAM.lengths) - return True - except IOError,message: - logging.error("Cannot open input file") - return False - -# def printFilteredReads(self): -# for i in self.filteredAlignment: -# print i - - def printClusters(self): - outfile = open(self.outprefix+".clusters.bed","w") - for i in [self.clusters_plus,self.clusters_minus]: - for j in i: - print >>outfile,j - outfile.close() - - def printMutations(self): - outfile = open(self.outprefix+".mutations.bed","w") - for i in self.mutations.values(): - print >>outfile,i - outfile.close() - - - def printMutations_chr(self): - fhs = {}#file handle dic, key is chr, value is file handle - for chr,chrlen in self.refInfo: - outfile = open(self.outprefix+"."+chr+".mutations.bed","w") - fhs[chr] = outfile - for i in self.mutations.itervalues(): - print >>fhs[i.chr],i - #clean up - for fh in fhs.itervalues(): - fh.close() - del fhs - - - def printMutationSingleChr(self,chr): - outfile = open(self.outprefix+"."+chr+".mutations.bed","w") - for i in self.mutations.itervalues(): - print >>outfile,i - #clean up - outfile.close() - - def printReliableMutations(self): - outfile = open(self.outprefix+".reliableMutations.pipeclip.bed","w") - header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr" - print >> outfile,header - self.printEnrichedItem(self.sigMutations,outfile) - - - def printEnrichedClusters(self): - outfile = 
open(self.outprefix+".enrichedClusters.pipeclip.bed","w") - header = "#chr\tstart\tstop\tcluster_name\tread_count\tstrand\tp_value\tfdr" - print >> outfile,header - self.printReliableList(self.clusters,outfile) - return [self.outprefix+".enrichedClusters.pipeclip.bed"] - - def printCrosslinkingMutations(self): - outfile = open(self.outprefix+".crosslinkingMutations.pipeclip.bed","w") - header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr" - print >> outfile,header - self.printReliableList(self.crosslinkingMutations,outfile) - - - def printReliableList(self,mylist,fh): - for i in mylist: - if i.sig: - st = i.__str__() - st += "\t"+str(i.pvalue)+"\t"+str(i.qvalue) - print >>fh,st - - def printList(self,mylist,fh): - for i in mylist: - print >>fh,i - - def printEnrichedItem(self,dic,fh): - for k in dic.keys(): - for i in dic[k]: - st = i.__str__() - st += "\t"+str(i.pvalue)+"\t"+str(i.qvalue) - print >> fh,st - - def printCrosslinkingSites(self): - output = self.outprefix - header = "#chr\tstart\tstop\tcluster_name\treads_count\tstrand\tcluster_fdr\tcrosslinking_fisherP\tmutation_pos\tmutation_name" - if self.type == 0:#HITS-CLIP, three output - output_del = open(output+"_deletion_crosslinking.pipeclip.txt","w") - output_sub = open(output+"_substitution_crosslinking.pipeclip.txt","w") - output_ins = open(output+"_insertion_crosslinking.pipeclip.txt","w") - print >> output_del,header - print >> output_sub,header - print >> output_ins,header - else: - output_name = open(output+"_crosslinking.pipeclip.txt","w") - print >> output_name,header - for k in self.crosslinking.keys(): - st = self.crosslinking[k].__str__() - st += "\t"+"\t".join([str(self.crosslinking[k].qvalue),str(self.crosslinking[k].fisherP)]) - st += "\t"+",".join(self.crosslinking[k].mutationStarts) - st += "\t"+",".join(self.crosslinking[k].mutationNames) - if self.type == 0: - output_key = k.split("_")[-1] - if output_key == "Deletion": - print >> output_del,st - elif output_key == "Insertion": - print >> output_ins,st - elif output_key == "Substitution": - print >> output_sub,st - else: - print >> output_name,st - - if self.type == 0: - output_del.close() - output_sub.close() - output_ins.close() - return [output+"_insertion_crosslinking.pipeclip.txt",output+"_deletion_crosslinking.pipeclip.txt",output+"_substitution_crosslinking.pipeclip.txt"] - else: - output_name.close() - return [output+"_crosslinking.pipeclip.txt"] - - - def updatePreviousQul(self,n,q,m): - self.previousQul[0] = n - self.previousQul[1] = q - self.previousQul[2] = m - - - def updateCurrentGroup(self,read,mlen,mis): - '''Compare read to current duplication group parameters, determine to add to, to drop or to replace, make sure duplication group only has reads with best quality''' - if mlen >= self.previousQul[0] and read.mapq >= self.previousQul[1] and mis <= self.previousQul[2]:# consider to append or replace only when read has no worse quality - if mlen > self.previousQul[0] or read.mapq > self.previousQul[1] or mis < self.previousQul[2]:# read has better quality,replace - self.currentGroup = [read] - self.updatePreviousQul(mlen,read.mapq,mis) - else: - self.currentGroup.append(read) - - def iniDupGroupInfo(self,read,group_key,mlength,mismatch): - self.currentGroupKey = group_key - self.currentGroup = [read] - self.updatePreviousQul(mlength,read.mapq,mismatch) - - def rmdup(self): - '''Return random one read of highest quality from list''' - #print "get one from group" - if len(self.currentGroup)==1: - #print 
self.currentGroup[0] - return self.currentGroup[0] - else: - random.seed(1) - index = random.randint(0,len(self.currentGroup)-1) - #print self.currentGroup[index] - return self.currentGroup[index] - - def updateCluster(self,read): - '''Cluster new read to known clusters and update cluster reads count''' - strandDic = {"True":"-","False":"+"} - clusterName = "cluster"+"_"+str(len(self.clusters_plus)+len(self.clusters_minus)+1) - newRead = Alignment.ClusterBed(self.originalBAM.getrname(read.tid),read.pos,read.pos+len(read.seq),clusterName,1,strandDic[str(read.is_reverse)]) - if read.is_reverse: - if self.currentCluster_minus.chr == "": #Initiate cluster - self.currentCluster_minus = newRead - self.clusters_minus.append(self.currentCluster_minus) - self.clusterCount += 1 - else: - if self.currentCluster_minus.overlap(newRead): - self.currentCluster_minus.merge(newRead) - self.clusters_minus[-1]=self.currentCluster_minus - else:#New read is a new cluster - #self.clusters.append(self.currentCluster) - self.currentCluster_minus = newRead - self.clusters_minus.append(self.currentCluster_minus) - self.clusterCount+=1 - else: - if self.currentCluster_plus.chr == "": #Initiate cluster - self.currentCluster_plus = newRead - self.clusters_plus.append(self.currentCluster_plus) - self.clusterCount+=1 - else: - if self.currentCluster_plus.overlap(newRead): - self.currentCluster_plus.merge(newRead) - self.clusters_plus[-1]=self.currentCluster_plus - else:#New read is a new cluster - #self.clusters.append(self.currentCluster) - self.currentCluster_plus = newRead - self.clusters_plus.append(self.currentCluster_plus) - self.clusterCount += 1 - - - def updateMutation(self,read,mis): - mutations = [] - if self.type == 3:#iclip,find truncation - mutations = Mutation2.getTruncations(self.originalBAM,read) - else: - mutations = Mutation2.getMutations(self.originalBAM,read) - if self.type ==1: - mutation_filter = Utils.filterMutations(mutations,"T->C",True) - mutations = mutation_filter - elif self.type ==2: - mutation_filter = Utils.filterMutations(mutations,"G->A",True) - mutations = mutation_filter - #logging.debug("read %s " % read) - if len(mutations)>0: - for m in mutations: - #print m - self.mutationCount += 1 - m_key = "_".join([m.chr,str(m.start),m.strand,m.type]) - if self.mutations.has_key(m_key): - self.mutations[m_key].increaseScore() - else: - self.mutations[m_key]=m - - def updateCLIPinfo(self,read,matchlen,miscount): - '''Update sample coverage info, clustering, mutation info''' - #logging.debug("read %s, cigar %s,mismatch %d" % (read.qname,read.cigar,miscount)) - #update sample coverage info - self.coverage += matchlen - #update cluster info - self.updateCluster(read) - #update mutation info - if miscount > 0: - self.updateMutation(read,miscount) - - def addSigToDic(self,dic,mu): - '''Add new mutation into the dictionary.Mutations should be sorted''' - if dic.has_key(mu.chr): - dic[mu.chr].append(mu) - else: - dic[mu.chr] = [mu] - - def getCrosslinking(self): - '''Merge enriched clusters and reliable mutations together - Call Enrich.fisherTest() to calculate joint p vlaue - ''' - for cluster in self.clusters: - #logging.debug("P value of cluster is %f" % cluster.pvalue) - if cluster.sig and self.sigMutations.has_key(cluster.chr): - for mutation in self.sigMutations[cluster.chr]: - #logging.debug("Cluster loc %d,%d ; Mutation loc %d,%d, mutation type %s" % (cluster.start,cluster.stop,mutation.start,mutation.stop,mutation.type)) - if cluster.overlap(mutation): - #logging.debug("Overlapped") - if 
self.type == 0:#HITS-CLIP - mutation_key = mutation.type.split("->")[0] - if mutation_key in ["A","C","G","T"]: - mutation_key = "Substitution" - cross_key = cluster.name+"_"+mutation_key - else: - cross_key = cluster.name - if self.crosslinking.has_key(cross_key): - #logging.debug("Existing mutation pvalue:",self.crosslinking[cluster.name].mutationP) - self.crosslinking[cross_key].addMutation(mutation) - self.crosslinkingMutations.append(mutation) - else: - self.crosslinking[cross_key] = Alignment.CrosslinkingBed(cluster.chr,cluster.start,cluster.stop,cluster.name,cluster.score,cluster.strand,cluster.pvalue,cluster.qvalue,mutation.start,mutation.name,mutation.pvalue) - self.crosslinkingMutations.append(mutation) - #start to calculate fisher test p value - for k in self.crosslinking.iterkeys(): - self.crosslinking[k].fishertest() - - - def filter(self,matchLen,mismatch,cliptype,duprm): - '''Filter the input BAM file according to parameters. Make clusters and mutations at the same time''' - logging.info("Start to filter alignment using parameters:") - logging.info("match length:%d" % (matchLen)) - logging.info("mismatch count: %d" % (mismatch)) - logging.info("CLIP type:%s" % (str(cliptype))) - logging.info("Rmdup code:%s" % (str(duprm))) - logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped)) - #print >> sys.stderr,zip(self.originalBAM.references,self.originalBAM.lengths) - #sys_info = os.system("free -m") - #logging.debug(sys_info) - #outBAM = pysam.Samfile(outprefix+".filtered.bam","wb",template=self.originalBAM) - self.originalMapped = self.originalBAM.mapped - outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM) - outBAM_neg = pysam.Samfile(self.outprefix+".neg.filtered.bam","wb",template=self.originalBAM) - self.type = cliptype - if cliptype == 3:#make sure there is no rmdup for iCLIP data - duprm = 0 - count = 0 - start_time = datetime.datetime.now() - for alignment in self.originalBAM: - #print "Now processing",alignment.qname - if not alignment.cigar : #reads is unmapped - continue - count += 1 - if count % 500000 ==0: - stop_time = datetime.datetime.now() - logging.debug("Processed %d reads in %s" % (count,str(stop_time-start_time))) - start_time = stop_time - flag,mlen,mis = Utils.readQuaFilter(alignment,matchLen,mismatch) - if flag: - #print "Qualified read" - #print alignment - #print "current Gourp key",self.currentGroupKey - if duprm > 0: - #get group key - if duprm == 1: - groupkey = Utils.rmdupKey_Start(alignment) - elif duprm == 2: - groupkey = Utils.rmdupKey_Seq(alignment) - #check current group - if groupkey == self.currentGroupKey:#overlap with current group, update group - self.updateCurrentGroup(alignment,mlen,mis) - else:#get read from current group and discard it, use current read to start a new current group - if self.currentGroupKey!="None":#current group exists - keep = self.rmdup() - #logging.debug("Pop out read to keep %s" % keep) - self.currentGroup = [] - self.filteredAlignment += 1 - flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) - self.updateCLIPinfo(keep,mlen,mis) - #outBAM.write(keep) - if keep.is_reverse: - outBAM_neg.write(keep) - else: - outBAM_pos.write(keep) - self.iniDupGroupInfo(alignment,groupkey,mlen,mis)#make new group using current alignment - else:#there is no rmdup - #logging.debug("Good read, update clip info %s" % read.qname) - self.filteredAlignment+=1 - self.updateCLIPinfo(alignment,mlen,mis) - #outBAM.write(alignment) - if alignment.is_reverse: - 
outBAM_neg.write(alignment) - else: - outBAM_pos.write(alignment) - #clean up the final dupGroup, if rmdup==0, there is no final dupGroup - if len(self.currentGroup)>0: - keep = self.rmdup() - self.currentGroup = [] - self.filteredAlignment+=1 - flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) - self.updateCLIPinfo(keep,mlen,mis) - #outBAM.write(alignment) - if keep.is_reverse: - outBAM_neg.write(keep) - else: - outBAM_pos.write(keep) - #Logging CLIP sample information - #outBAM.close() - outBAM_pos.close() - outBAM_neg.close() - #pysam.index(outprefix+".filtered.bam") - pysam.index(self.outprefix+".pos.filtered.bam") - pysam.index(self.outprefix+".neg.filtered.bam") - self.posfilteredBAM = self.outprefix+".pos.filtered.bam" - self.negfilteredBAM = self.outprefix+".neg.filtered.bam" - self.clusters = self.clusters_plus + self.clusters_minus - self.printClusters() - self.printMutations_chr() - #Clean up variables - self.originalBAM = None - self.clusters_plus = None - self.clusters_minus = None - self.mutations = None - gc.collect() - logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) - logging.debug("There are %d clusters in total" % (self.clusterCount)) - logging.debug("There are %d mutations in total" % (self.mutationCount)) - - - def filter2(self,matchLen,mismatch,cliptype,duprm): - '''Filter the input BAM file according to parameters. Make clusters and mutations at the same time''' - logging.info("Start to filter alignment using parameters:") - logging.info("match length:%d" % (matchLen)) - logging.info("mismatch count: %d" % (mismatch)) - logging.info("CLIP type:%s" % (str(cliptype))) - logging.info("Rmdup code:%s" % (str(duprm))) - logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped)) - - self.originalMapped = self.originalBAM.mapped - outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM) - outBAM_neg = pysam.Samfile(self.outprefix+".neg.filtered.bam","wb",template=self.originalBAM) - self.type = cliptype - if cliptype == 3:#make sure there is no rmdup for iCLIP data - duprm = 0 - count = 0 - start_time = datetime.datetime.now() - currentChr = "" - for alignment in self.originalBAM: - if not alignment.cigar : #reads is unmapped - continue - c_chr = self.originalBAM.getrname(alignment.tid) - if currentChr == "": - currentChr = c_chr - elif c_chr!=currentChr: - #logging.debug("Mutation sites for Current ref %s is %d" % (currentChr,len(self.mutations.keys()))) - self.printMutationSingleChr(currentChr) - self.mutations = None - gc.collect() - self.mutations = {} - currentChr = c_chr - count += 1 - if count % 5000000 ==0: - stop_time = datetime.datetime.now() - logging.debug("Processed %d reads in %s" % (count,str(stop_time-start_time))) - start_time = stop_time - flag,mlen,mis = Utils.readQuaFilter(alignment,matchLen,mismatch) - if flag: - if duprm > 0: - #get group key - if duprm == 1: - groupkey = Utils.rmdupKey_Start(alignment) - elif duprm == 2: - groupkey = Utils.rmdupKey_Seq(alignment) - #check current group - if groupkey == self.currentGroupKey:#overlap with current group, update group - self.updateCurrentGroup(alignment,mlen,mis) - else:#get read from current group and discard it, use current read to start a new current group - if self.currentGroupKey!="None":#current group exists - keep = self.rmdup() - #logging.debug("Pop out read to keep %s" % keep) - self.currentGroup = [] - self.filteredAlignment += 1 - flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) - 
self.updateCLIPinfo(keep,mlen,mis) - #outBAM.write(keep) - if keep.is_reverse: - outBAM_neg.write(keep) - else: - outBAM_pos.write(keep) - self.iniDupGroupInfo(alignment,groupkey,mlen,mis)#make new group using current alignment - else:#there is no rmdup - #logging.debug("Good read, update clip info %s" % read.qname) - self.filteredAlignment+=1 - self.updateCLIPinfo(alignment,mlen,mis) - #outBAM.write(alignment) - if alignment.is_reverse: - outBAM_neg.write(alignment) - else: - outBAM_pos.write(alignment) - #clean up the final dupGroup, if rmdup==0, there is no final dupGroup - if len(self.currentGroup)>0: - keep = self.rmdup() - self.currentGroup = [] - self.filteredAlignment+=1 - flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) - self.updateCLIPinfo(keep,mlen,mis) - #outBAM.write(alignment) - if keep.is_reverse: - outBAM_neg.write(keep) - else: - outBAM_pos.write(keep) - #clean up current mutation hash - self.printMutationSingleChr(currentChr) - self.mutations = None - #Logging CLIP sample information - #outBAM.close() - outBAM_pos.close() - outBAM_neg.close() - pysam.index(self.outprefix+".pos.filtered.bam") - pysam.index(self.outprefix+".neg.filtered.bam") - self.posfilteredBAM = self.outprefix+".pos.filtered.bam" - self.negfilteredBAM = self.outprefix+".neg.filtered.bam" - self.clusters = self.clusters_plus + self.clusters_minus - self.printClusters() - #Clean up variables - self.originalBAM = None - self.clusters_plus = None - self.clusters_minus = None - self.mutations = None - gc.collect() - logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) - logging.debug("There are %d clusters in total" % (self.clusterCount)) - logging.debug("There are %d mutations in total" % (self.mutationCount)) diff --git a/lib/Enrich.py b/lib/Enrich.py deleted file mode 100755 index 89c322c..0000000 --- a/lib/Enrich.py +++ /dev/null @@ -1,428 +0,0 @@ -#!/usr/bin/python -# Programmer : beibei.chen@utsouthwestern.edu -# Usage: Get reliable mutations using binomial distribution -# Input: Filtered BAM, reads coverage (generated by SAMFilter.py), mutation file -# Output: BED -# Last modified: 18 Sep.2013 - - -import sys -import re -import random -import string -import logging -import pysam -from pysam import * -import argparse as ap -from pybedtools import BedTool -import copy -import rpy2.robjects as robject -from rpy2.robjects.packages import importr -from rpy2.robjects import FloatVector -import math -from collections import Counter -import subprocess -import OptValidator -import datetime -import time -import Utils -import os -import gc -import Alignment -import os - -OptValidator.opt_validate() -gc.enable() - -stats = importr('stats') - -def freqRank(readCount,rev=False): - key = sorted(readCount.keys(),reverse=rev) - r_rank = {} - rank = 0 - for i in key: - rank += readCount[i] - r_rank[i] =rank - return r_rank - -def BH(pvalue,pRank,N): - a = N/float(pRank) - q = a * pvalue - qva = max(pvalue, q) - return qva - -#@profile -def KMvalue(posmapfile,negmapfile,mufile): - ''' - Calculate K(coverage) value for each mutation location - Mutations are already unique. 
- ''' - km = []#store mutations with updated k value - km_pair = {}#Dic of count tuples of (k,m),key:"K_M" - count = 0 - logging.debug("make wig %s" % str(datetime.datetime.now())) - poswig = Utils.makeWig(posmapfile) - negwig = Utils.makeWig(negmapfile) - start_time = datetime.datetime.now() - logging.debug("finish making wig %s" % str(start_time)) - for item in mufile: - count += 1 - if count % 5000 == 0: - stop_time = datetime.datetime.now() - logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) - start_time = stop_time - st = [] - strand = item.strand - M = item.score - K = 0 - #logging.debug("Time begin to pileup is %s" % (str(datetime.datetime.now()))) - if strand == "+": - try: - K = poswig[item.chr][str(item.start)] - except: - continue - - elif strand == "-": - try: - K = negwig[item.chr][str(item.start)] - except: - continue - if K>=M: - item.updateK(K) - #print item - #logging.debug("K value for item %s is %d" % (item, K)) - pair_name = str(K)+"_"+str(M) - if km_pair.has_key(pair_name): - km_pair[pair_name] += 1 - else: - km_pair[pair_name] = 1 - #km.append(item) - gc.collect() - return km_pair - -def KMvalue_test(clip,mutations,chr,chrlen): - ''' - Calculate K(coverage) value for each mutation location - Mutations are already unique. - ''' - km = []#store mutations with updated k value - count = 0 - #logging.debug("make wig %s" % str(datetime.datetime.now())) - posBAM = pysam.Samfile(clip.posfilteredBAM,"rb") - negBAM = pysam.Samfile(clip.negfilteredBAM,"rb") - start_time = datetime.datetime.now() - poswig = Utils.makeWigByChr(posBAM,chr) - negwig = Utils.makeWigByChr(negBAM,chr) - stop_time = datetime.datetime.now() - #logging.debug("Finished making wig for %s using %s" % (chr,str(stop_time-start_time))) - start_time = stop_time - for item in mutations: - count += 1 - if count % 100000 == 0: - stop_time = datetime.datetime.now() - logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) - start_time = stop_time - st = [] - M = item.score - K = 0 - strand = item.strand - if strand == "+": - try: - K = poswig[item.start] - except: - #log.warning("Did not find mutation in poswig") - continue - elif strand == "-": - try: - K = negwig[item.start] - except: - continue - if K>=M: - item.updateK(K) - pair_name = str(K)+"_"+str(M) - if clip.kmpair.has_key(pair_name): - clip.kmpair[pair_name] += 1 - else: - clip.kmpair[pair_name] = 1 - posBAM.close() - negBAM.close() - -def uniq(b): #b is a list - uniqElements = [] - for i in b: - if uniqElements.count(i)==0: - uniqElements.append(i) - uniqElements.sort() - return uniqElements - -def mutationEnrich(clip,threshold=0.01): - coverage = clip.coverage *1.0 - totalMuCount = clip.mutationCount - mutations = [] - total_test = 0 - for chr,chrlen in clip.refInfo: - #logging.debug(chr) - try: - mufile = open(clip.outprefix+"."+chr+".mutations.bed") - except: - logging.info("Cannot open mutation file %s , move on." 
% (clip.outprefix+"."+chr+".mutations.bed")) - continue - for record in mufile: - total_test += 1 - info = record.rstrip().split("\t") - new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6]) - mutations.append(new_mu) - try: - pass - #os.remove(clip.outprefix+"."+chr+".mutations.bed") - except: - pass - logging.debug(len(mutations)) - KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed - try: - os.remove(clip.posfilteredBAM) - os.remove(clip.negfilteredBAM) - os.remove(clip.posfilteredBAM+".bai") - os.remove(clip.negfilteredBAM+".bai") - except: - pass - del clip.posfilteredBAM - del clip.negfilteredBAM - gc.collect()#logging.info("Finished K-M counting, starting fitting.") - - R = robject.r - reliableList = [] - P = totalMuCount/coverage - km_p = {}#store km and corresponding p value - pvalues = [] - for k in clip.kmpair:#KM_test: - parameters = k.split("_") - p = R.pbinom(int(parameters[1])-1,int(parameters[0]),P,False)[0] - pvalues.append(p) - km_p[k]=p - pCount = dict(Counter(pvalues)) - pRank = freqRank(pCount,True) - pqDic={} - for i in pRank.keys(): - try: - p_rank = pRank[i] - q_value = BH(i,p_rank,total_test) - pqDic[i]=q_value - except: - print >> sys.stderr,"Cannot find p value in dictionary" - continue - count = 0 - for mu in mutations: - name = str(mu.kvalue)+"_"+str(mu.score) - try: - mu.pvalue = km_p[name] - except: - #logging.debug("Problem with %s" % mu) - continue - mu.qvalue = pqDic[mu.pvalue] - if mu.qvalue <= threshold: - count += 1 - new_mutationName = "Mutation_"+str(count) - mu.name = new_mutationName - mu.sig = True - clip.sigMutationCount+=1 - clip.addSigToDic(clip.sigMutations,mu) - - mutations = None - gc.collect() - -def mutationEnrichWCtrl(clip,ctrlclip,threshold=0.01): - coverage = ctrlclip.coverage *1.0 - totalMuCount = ctrlclip.mutationCount - mutations = [] - total_test = 0 - for chr,chrlen in clip.refInfo: - try: - mufile = open(clip.outprefix+"."+chr+".mutations.bed") - except: - logging.info("Cannot open mutation file %s , move on." 
% (clip.outprefix+"."+chr+".mutations.bed")) - continue - for record in mufile: - total_test += 1 - info = record.rstrip().split("\t") - new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6]) - mutations.append(new_mu) - try: - os.remove(clip.outprefix+"."+chr+".mutations.bed") - os.remove(controlclip.outprefix+"."+chr+".mutations.bed") - except: - pass - KM_test = KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed - try: - os.remove(clip.posfilteredBAM) - os.remove(clip.negfilteredBAM) - os.remove(clip.posfilteredBAM+".bai") - os.remove(clip.negfilteredBAM+".bai") - except: - pass - del clip.posfilteredBAM - del clip.negfilteredBAM - gc.collect()#logging.info("Finished K-M counting, starting fitting.") - - R = robject.r - reliableList = [] - P = totalMuCount/coverage - km_p = {}#store km and corresponding p value - pvalues = [] - for k in clip.kmpair:#KM_test: - parameters = k.split("_") - p = R.pbinom(int(parameters[1])-1,int(parameters[0]),P,False)[0] - pvalues.append(p) - km_p[k]=p - pCount = dict(Counter(pvalues)) - pRank = freqRank(pCount,True) - pqDic={} - for i in pRank.keys(): - try: - p_rank = pRank[i] - q_value = BH(i,p_rank,total_test) - pqDic[i]=q_value - except: - print >> sys.stderr,"Cannot find p value in dictionary" - continue - count = 0 - for mu in mutations: - name = str(mu.kvalue)+"_"+str(mu.score) - try: - mu.pvalue = km_p[name] - except: - #logging.debug("Problem with %s" % mu) - continue - mu.qvalue = pqDic[mu.pvalue] - if mu.qvalue <= threshold: - count += 1 - new_mutationName = "Mutation_"+str(count) - mu.name = new_mutationName - mu.sig = True - clip.sigMutationCount+=1 - clip.addSigToDic(clip.sigMutations,mu) - - mutations = None - gc.collect() - -def clusterEnrich(clip,threshold=0.01): - cluster_filename = clip.outprefix+".clusters.bed"#clip.filepath.split("/")[-1].split(".")[0] - #Call R code and get result - epsilon = [0.01,0.15,0.1] - step = [0.1,0.08,0.05] - for index in range(len(epsilon)): - e = epsilon[index] - s = step[index] - r_args = ['Rscript','lib/ZTNB_tryCatch.R',cluster_filename,str(threshold),str(e),str(s)] - gc.collect() - p = subprocess.Popen(r_args) - stdout_value = p.communicate()[0] - try: - r_output_log = open(cluster_filename+".pipeclip.ztnblog","r") - #logging.debug("Log file opened") - flag = r_output_log.read(1) - if flag == "Y":#converged - break - elif flag=="N": - continue - except: - logging.info("No log file was produced by R code, continue regression using other parameters anyway.") - continue - - #check ztnb file - try: - enrich_parameter = open(cluster_filename+".pipeclip.ztnb","r") - except IOError,message: - logging.error("Cannot open ztnb result file") - return False - nbDic = {} - for item in enrich_parameter: - buf = item.rstrip().split("\t") - if buf[0]!="#": - nb_key = "_".join(buf[0:2]) #reads_length as key - #logging.debug("NB_key %s" % nb_key) - if not nbDic.has_key(nb_key): - nbDic[nb_key]=(buf[2],buf[3])#pvalue and qvalue - #logging.info("There are %d read-length pairs" % (len(nbDic.keys()))) - if len(nbDic.keys())==0: - logging.error("There are no read-length pairs found by ZTNB. 
Exit.") - return False - else: - for i in range(len(clip.clusters)): - r_key = str(clip.clusters[i].score)+"_"+str(clip.clusters[i].stop-clip.clusters[i].start) - #logging.debug("Keys from clip.clusters,%s" % r_key) - if nbDic.has_key(r_key): - clip.clusters[i].pvalue = nbDic[r_key][0] - clip.clusters[i].qvalue = nbDic[r_key][1] - clip.clusters[i].sig = True - clip.sigClusterCount += 1 - nbDic = None - try: - os.remove(cluster_filename) - except: - pass - if clip.sigClusterCount == 0: - return False - else: - return True - -def clusterEnrich_outsource(clip, threshold=0.01): - cluster_filename = clip.outprefix+".clusters.bed"#clip.filepath.split("/")[-1].split(".")[0] - #Call R code and get result - #epsilon = [0.01,0.15,0.1] - #step = [0.1,0.08,0.05] - sh_args = ['sh','lib/runR1.sh',cluster_filename,str(threshold)] - p = subprocess.Popen(sh_args) - stdout_value = p.communicate()[0] - - #check ztnb file - try: - enrich_parameter = open(cluster_filename+".pipeclip.ztnb","r") - except IOError,message: - logging.error("Cannot open ztnb result file") - return False - nbDic = {} - for item in enrich_parameter: - buf = item.rstrip().split("\t") - if buf[0]!="#": - nb_key = "_".join(buf[0:2]) #reads_length as key - #logging.debug("NB_key %s" % nb_key) - if not nbDic.has_key(nb_key): - nbDic[nb_key]=(buf[2],buf[3])#pvalue and qvalue - #logging.info("There are %d read-length pairs" % (len(nbDic.keys()))) - if len(nbDic.keys())==0: - logging.error("There are no read-length pairs found by ZTNB. Exit.") - return False - else: - for i in range(len(clip.clusters)): - r_key = str(clip.clusters[i].score)+"_"+str(clip.clusters[i].stop-clip.clusters[i].start) - #logging.debug("Keys from clip.clusters,%s" % r_key) - if nbDic.has_key(r_key): - clip.clusters[i].pvalue = nbDic[r_key][0] - clip.clusters[i].qvalue = nbDic[r_key][1] - clip.clusters[i].sig = True - clip.sigClusterCount += 1 - nbDic = None - try: - os.remove(cluster_filename) - except: - pass - if clip.sigClusterCount == 0: - return False - else: - return True - - -def fisherTest(clusterp,mutationp): - R = robject.r - min_mp = min(mutationp) - product = clusterp * min_mp - if product == 0: - fps = 0 - else: - xsq = -2*math.log(clusterp * min_mp) - fp = R.pchisq(xsq,**{'df':4,'lower.tail':False,'log.p':True})[0] - fps = math.exp(fp) - return fps - - diff --git a/lib/Mutation2.py b/lib/Mutation2.py deleted file mode 100644 index 04bc4a6..0000000 --- a/lib/Mutation2.py +++ /dev/null @@ -1,295 +0,0 @@ -#!/usr/bin/python -# programmer : bbc -# usage: -# output in BED format, the score is the offset of the mutation from 5' end - -import sys -import re -import random -import string -import copy -import pysam -from pysam import * -import argparse as ap -from Alignment import MutationBed -import logging -import OptValidator - -OptValidator.opt_validate() - - -def countMatchNumber(b): - myList = b - m = 0 - for i in myList: - if i[0]==0: - m += i[1] - return (m) - -def countInsertionNumber(b): - myList = b - m = 0 - for i in myList: - if i[0]==1: - m += i[1] - return (m) - -def countDeletionNumber(b): - myList = b - m = 0 - for i in myList: - if i[0]==2: - m += i[1] - return (m) - -def countMismatch(b): - myList = b - for i in myList: - if i[0]=="NM": - return (i[1]) - return 0 - -def survey(entry): - #logging.debug(entry.cigar) - mismatchNumber = countMismatch(entry.tags) - insertionNumber = countInsertionNumber(entry.cigar) - deletionNumber = countDeletionNumber(entry.cigar) - substitutionNumber = max(0,mismatchNumber-deletionNumber) - #print 
insertionNumber,deletionNumber - return ([insertionNumber,deletionNumber,substitutionNumber]) - - -def SBeforeFirstM(ci): - for index in range(len(ci)): - if ci[index][0] == 0: - if index == 0: - return 0 - else: - s = 0 - for i in range(0,index): - if ci[i][0] == 4: - s += ci[i][1] - return s -def parseCIGAR(ci): #calculate match segment length list - matchSeg = [] - preIndex = 0 - for index in range(len(ci)): - if ci[index][0] == 0: #match tag - preIndex = index - matchSeg.append(ci[index][1]) - if index > 0: #M is not the first tag - s = 0 - for i in range(preIndex-1,index): #check all the tags before M - if ci[i][0]==4: - s += ci[i][1] - matchSeg[-1] += s - #seach for the insertion behind (before the next match) - for insertionIndex in range(index+1,len(ci)): - inse = 0 - if ci[insertionIndex][0]==0: - break - elif ci[insertionIndex][0]==1:#insertion exists - inse += ci[insertionIndex][1] - matchSeg[-1]+=inse - return matchSeg - -def parseMD(b): - buf = [] - myList = b - for j in myList: - if j[0]=="MD": - md = copy.deepcopy(j[1]) - num = md.replace('A','^').replace('G','^').replace('T','^').replace('C','^').replace('^^','^').split('^') - for c in range(num.count("")): - num.remove("") - buf = [num[0]] - - counter = 1 - afterAlpha = 0 - for i in j[1]:#walk thought MD string to record mutation and deletion - if i.isalpha() or i == '^': - buf.append(i) - afterAlpha = 1 - else: - if afterAlpha and counter <= len(num)-1: - buf.append(num[counter]) - afterAlpha = 0 - counter += 1 - break - return buf - - -def insertionLocation(entry,num):#sam entry and insertion total number - insertionLoc = {} #key:ref_offset; value:list of seq_offset - ref_offset = 0 - seq_offset = [0] - preInsertion = 0 #used to check if it is the last insertion tag - for i in entry.cigar: - if i[0]==0 or i[0]==4:#match or soft clip - ref_offset += i[1] - seq_offset[-1] += i[1] - elif i[0]== 2: #deletion counts on ref but not on read seq - ref_offset+=i[1] - elif i[0]==1:#record this insertion tag and prepare for the next one - preInsertion += i[1] - for ins in range(1,i[1]): - newloc = seq_offset[-1]+1 - seq_offset.append(newloc) - st = [] - for item in seq_offset: - st.append(item) - insertionLoc[ref_offset]=st - if preInsertion == num: #this is the last insertion tag - return insertionLoc - else:#insertion only count on read seq - seq_index = seq_offset[0]+i[1] - -def countInsertionBefore(seqLoc,insertLocList): - if len(insertLocList)==0: - return 0 - else: - ins = 0 - for i in insertLocList: - if i>seqLoc: - return ins - else: - ins += 1 - return ins - -def mutationLocation(entry,insertLoc):#return mutation location in - mutations = [] - match = entry - myList = match.tags - S_count = SBeforeFirstM(match.cigar)#hard clip will cut the seq, doesn't count - mis = countMismatch(myList) - mdlist = parseMD(myList) - mdtag = ''.join(mdlist) - counter = 0 - if not mdtag.isdigit(): #insertions may exist - st_seq = 0 - st_genome = 0 - offset = 0 - pre = ':' - set_insertion = 0 #recored insertion numbers already counted - for ch in mdlist:#i[1] is the MD tag - if ch.isdigit(): - st_seq += int(ch) - st_genome += int(ch) - pre = ch - elif ch == "^": - st_genome += 1 - pre = ch - elif ch.isalpha(): - if (not pre == '^') and (pre.isdigit()): - origin = ch - index = st_seq+S_count+offset - insertionBefore = countInsertionBefore(index,insertLoc) - loc = st_genome+match.pos+offset#-insertionBefore #0-based - index += insertionBefore-set_insertion # add on 9 Oct,modify 9,9 2014 - set_insertion = insertionBefore - - #print 
index,len(match.seq) - mu = match.seq[index] - if mu.upper() != "N": - offset = index-S_count+1 - st_seq = 0 - st_genome = 0 - if match.is_reverse: - chr = '-' - t = RC([origin,mu]) - origin = t[0] - mu = t[1] - else: - chr = '+' - mutation = [str(loc),str(loc+1),match.qname,1,chr,origin+"->"+mu]#change offset into mutation count, which is 1 - yield mutation - else:#this is a deletion - if pre in ["A","G","T","C"]: - st_genome += 1 - loc = st_genome+match.pos+offset-1 #0-based - if match.is_reverse: - strand = '-' - ch = RC([ch])[0] - else: - strand = '+' - index1 = loc - match.pos - insertionBefore = countInsertionBefore(index1,insertLoc) - index1 += insertionBefore-set_insertion #added 9 Oct - set_insertion = insertionBefore - mutation = [str(loc),str(loc+1),match.qname,1,strand,"Deletion->"+ch] #change str(index1) to 1 - yield mutation - pre = ch - - return - -def RC(strList): - rc = [] - for item in strList: - st = "" - for chIndex in range(len(item)): - rcIndex = len(item)-1 - if item[rcIndex].upper()== "A": - st += 'T' - elif item[rcIndex].upper()=="C": - st += 'G' - elif item[rcIndex].upper()=="T": - st += 'A' - elif item[rcIndex].upper()=="G": - st += 'C' - else: - st += 'N' - rc.append(st) - return(rc) - -def getMutations(infile,read): - #tmp = pysam.Samfile("demo.bam",'wb',template=infile) - #logging.debug("call getMutation function %s" % read) - mutationList = [] - b=read.tags - - sur = survey(read) - insertion = sur[0] - deletion = sur[1] - substi = sur[2] - #logging.debug("insertio:,%d, deletion:%d,sub:%d" % (insertion,deletion,substi)) - insertionSeqLoc = [] - if insertion > 0: - insertionDic = insertionLocation(read,insertion) - for k in insertionDic.keys(): - for loc_index in range(len(insertionDic[k])): - insertionSeqLoc.append(insertionDic[k][loc_index]) - mu = read.seq[insertionDic[k][loc_index]] - loc = k+loc_index+read.pos - if read.tid >=0: - chr = infile.getrname(read.tid) - if read.is_reverse: - strand = '-' - mu = RC([mu])[0] - else: - strand = "+" - mutationList.append(MutationBed(chr,loc,loc+1,read.qname,1,strand,"Insertion->"+mu))#change insertionDic[k][loc_index] to 1 - insertionSeqLoc.sort() - if deletion + substi > 0: - for mu in mutationLocation(read,insertionSeqLoc): - if read.tid>=0: - chr = infile.getrname(read.tid) - newMu = MutationBed(chr,mu[0],mu[1],mu[2],mu[3],mu[4],mu[5]) - mutationList.append(newMu) - #logging.debug(mutationList) - return mutationList - - -def getTruncations(infile,read): - '''One read only have one truncation, but in order to be consistent with muations, return a list''' - if read.is_reverse: - strand = "-" - start = read.pos + read.alen - else: - strand = "+" - start = read.pos - stop = start + 1 - if read.tid>=0: - chr = infile.getrname(read.tid) - newTr = MutationBed(chr,start,stop,read.qname,1,strand,"truncation") - return [newTr] - diff --git a/lib/OptValidator.py b/lib/OptValidator.py deleted file mode 100644 index 7ea9234..0000000 --- a/lib/OptValidator.py +++ /dev/null @@ -1,7 +0,0 @@ -import sys -import logging - -def opt_validate (): - #opt validation to be added - #set logging - logging.basicConfig(format='%(levelname)s:%(message)s',level=logging.DEBUG) diff --git a/lib/Utils.py b/lib/Utils.py deleted file mode 100644 index fc3887d..0000000 --- a/lib/Utils.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/python -# programmer: beibei.chen@utsouthwestern.edu -# Usage: definition of utilities to handel pysam classes - -import sys -import Alignment -import subprocess -import logging - -def is_sorted(header): - '''Get a BAM 
header, and check if this file is sorted or not - Return: True: sorted, False: unsorted - Header variable is a list - ''' - for row in header: - buf = row.rstrip().split("\t") - if buf[0]=="@HD": - if buf[2] in ["SO:unsorted","SO:unknown"] : - return False - elif buf[2] in ["SO:coordinate","SO:queryname"]: - return True - #If no HD header contains SO info, return False - return False - -def readQuaFilter(read,mlen,mis): - matchlen = 0 - mismatch = 0 - #logging.debug("read %s,cigar %s,tags %s" % (read.qname,read.cigar,read.tags)) - try: - for i in read.cigar: - #logging.debug(i) - if i[0]==0: - matchlen += i[1] - elif i[0]==1: - mismatch += i[1] - for j in read.tags: - if j[0]=="NM": - mismatch += j[1] - break - #logging.debug("matchlen %d,mismatch %d" % (matchlen,mismatch)) - except: - #logging.debug("Problematic read %s" % (read)) - pass - if matchlen>=mlen and mismatch<=mis: - return (True,matchlen,mismatch) - else: - return (False,0,0) - -def rmdupKey_Start(read): - #print >>sys.stderr,"Remove duplicate by alignment start" - k = str(read.tid)+":"+str(read.pos)+":"+str(read.is_reverse) - #print >>sys.stderr,k - return k - - -def rmdupKey_Seq(read): - k = str(read.tid)+":"+str(read.pos)+":"+str(read.is_reverse)+":"+read.seq - return k - -def filterMutations(mlist,feature,keep=True): - newList = [] - for i in mlist: - if i.type==feature: - if keep: - newList.append(i) - else: - if not keep: - newList.append(i) - return newList - -def annotation(fileaddr,species): - '''Call Homer annotation script''' - outfile = open(fileaddr+".anno","w") - args = ['annotatePeaks.pl',fileaddr,species] - p = subprocess.Popen(args,stdout=outfile) - stdout_value = p.communicate()[0] - outfile.close() - -def makeWig(bamfile): - wig = {} - for r in bamfile.references: - wig[r] = {} - for pileupColumn in bamfile.pileup(r): - wig[r][str(pileupColumn.pos)]=pileupColumn.n - return wig - - -def makeWigByChr(bamfile,chr): - wig = {} - for pileupColumn in bamfile.pileup(chr): - wig[pileupColumn.pos]=pileupColumn.n - return wig - - -def makeWigListByChr(bamfile,chr): - wig = Alignment.wiglist() - for pileupColumn in bamfile.pileup(chr): - wig.update(pileupColumn.pos,pileupColumn.n) - return wig - - -def makeWigListByChr_array(bamfile,chr,chrlen): - wig = [] - for i in range(chrlen): - wig.append(0) - for pileupColumn in bamfile.pileup(chr): - wig[pileupColumn.pos]=pileupColumn.n - return wig - -def bisort(alist,b): - '''List is a list of bed instances, b is also an instance''' - if isinstance(alist[0],Alignment.BED) and isinstance(b,Alignment.BED): - pass diff --git a/lib/ZTNB_tryCatch.R b/lib/ZTNB_tryCatch.R deleted file mode 100755 index 710b81a..0000000 --- a/lib/ZTNB_tryCatch.R +++ /dev/null @@ -1,173 +0,0 @@ -#Require at least R3.0 and package VGAM and dependencies -#Programmer: jonghyun.yun@utsouthwestern.edu and beibei.chen@utsouthwestern.edu -#Usage: Get all the reads count and length combinations of enriched clusters -#Input: BED file with score column as reads number -#Output: table with reads count, length, p value and fdr -#Last modified: 20 Sep 2014 - -#command line parameters: -#1:input file name. 
After running the code, two output files: input.ztnb and input.ztnblog will be created -#2:FDR cutoff -#3:Regression epsilon -#4:Regression step - -suppressMessages(require(MASS)); -suppressMessages(require(VGAM)); - -#require(MASS) -#require(VGAM) -args = commandArgs(TRUE); -data = read.table(args[1], sep = "\t"); -len = data[,3] - data[,2]; -read = data[,5]; -cut = as.numeric(as.character(args[2])); -vglm_epsilon = as.numeric(as.character(args[3])); -vglm_step = as.numeric(as.character(args[4])); -# tabularize the data -wts = table(read,len); -temp.coln = as.integer(names(wts[1,])); -temp.rown = as.integer(names(wts[,1])); -drow = length(temp.rown); -dcol = length(temp.coln); - -rown = matrix(rep(temp.rown,dcol),drow,dcol); -coln = t(matrix(rep(temp.coln,drow),dcol,drow)); - -wdx = which(wts>0); -tread = rown[wdx]; -tlen = coln[wdx]; -twts = wts[wdx]; -#print("local polynomial regression") -# local polynomial regression: read ~ len -# smoothing parameter: -alp = 0.95 -# polynomial degree: -pd = 1 -lregfit = loess(tread ~ tlen, weights = twts, span = alp, family ="symmetric", degree = pd); - -mu = lregfit$fitted; -mu[mu<=0] = min(exp(-4),mu[mu>0]); # bounded mean function -logmu = log(mu); - - -# compute p-values using N(0,1) for (tread[-cdx] - mu[-cdx]) / sqrt(mu[-cdx]) -nb.pvalue=numeric(length(tread)); -zscore = (tread-mu)/sqrt(mu); -cdx = which(zscore < 4); -nb.pvalue[-cdx] = pnorm(zscore[-cdx],lower.tail=FALSE); - -tread_fit = tread[cdx]; -tlen_fit = tlen[cdx]; -twts_fit = twts[cdx]; - -lregfit = loess(tread_fit ~ tlen_fit, weights = twts_fit, span = alp, family ="symmetric", degree = pd); -mu = lregfit$fitted; -mu[mu<=0] = min(exp(-4),mu[mu>0]); # bounded mean function -logmu = log(mu); - -# negative binomail regression with the known predictor log(mu) - -intercept1 = seq(-1,1,0.5); -intercept2 = seq(-1,1,0.5); -biggest_likelihood = -99999999; -khat = 0; -muhat = 0; -converge_flag = FALSE; -errmsg = paste("Error/warning messages for run using Epsilon:",vglm_epsilon,"and step size:",vglm_step,"."); -options(warn=1); - - -#use default setting first to see if it could converge -nb<-tryCatch({vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step,offset = logmu,epsilon=vglm_epsilon,silent=FALSE)},warning = function(w){ - #print(paste("typeof warning",typeof(w["message"]))) - warnmsg = strsplit(toString(w["message"])," iteration")[[1]][1]; - if(grepl("convergence not obtained",warnmsg)) - { - newmsg = paste("Convergence not obtained in 200 iterarions"); - return(newmsg); - } - }, error = function(e){ - newmsg = paste("Program failed to converge."); - return(newmsg) - }, finally = { - })# end of try-catch - if (length(nb)>0) - { - if(typeof(nb)=="S4") - { - #print("Converged, get regression class") - if(logLik(nb)>=biggest_likelihood & Coef(nb)["size"]<=100) - { - biggest_likelihood = logLik(nb); - khat <- Coef(nb)["size"]; - muhat <- Coef(nb)["munb"]; - } - - } else if (typeof(nb)=="character") - { - errmsg = paste(errmsg,nb,sep="\n") - } - } - - -if((biggest_likelihood==-99999999) && (khat==0) && (muhat ==0)) -{#try different coef start -for (i in intercept1) -{ - for (j in intercept2) - { - nb<-tryCatch({vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step,offset = logmu,epsilon=vglm_epsilon,silent=FALSE,coefstart=c(i,j))},warning = function(w){ - warnmsg = strsplit(toString(w["message"])," iteration")[[1]][1]; - if(grepl("convergence not obtained",warnmsg)) - { - newmsg = paste("Using 
coefstart c(",i,",",j,"), convergence not obtained in 200 iterarions"); - return(newmsg); - } - }, error = function(e){ - newmsg = paste("Using coefstart c(",i,",",j,"), program failed to converge."); - return(newmsg) - }, finally = { - })# end of try-catch - if (length(nb)>0) - { - if(typeof(nb)=="S4") - { - #print("Converged, get regression class") - if(logLik(nb)>=biggest_likelihood & Coef(nb)["size"]<=100) - { - biggest_likelihood = logLik(nb); - khat <- Coef(nb)["size"]; - muhat <- Coef(nb)["munb"]; - } - - } else if (typeof(nb)=="character") - { - errmsg = paste(errmsg,nb,sep="\n") - } - } - }# end of loop intercept2 - }# end of loop intercept1 -}#end of try different coefstart - -if((biggest_likelihood==-99999999) && (khat==0) && (muhat ==0)) -{ - #No model converged,exit - errmsg = paste("N:No model converged for this run.",errmsg,sep="\n"); -} else { - # model parameters - errmsg = paste("Y:Coverged for this run.",errmsg,sep="\n"); - write(errmsg,paste(args[1],".Converge.txt",sep="")) - nbsize = khat * mu; - nbmu = muhat * mu; - nb.pvalue[cdx] = pnbinom(tread_fit - 1, size = nbsize, mu = nbmu, lower.tail = FALSE) / (1-dnbinom(0, size = nbsize, mu = nbmu)) - nb.fdr = p.adjust(nb.pvalue,method='BH') - # output - nbdx = nb.fdr <= cut; - #sum(twts[nbdx]); # the number of clusters whose FDR <= cut - # read counts and cluster length of clusters whose FDR <= cut - nb.out = as.data.frame(matrix(c(tread[nbdx],tlen[nbdx],nb.pvalue[nbdx],nb.fdr[nbdx]),sum(nbdx),4)) - names(nb.out) = c("read","length","p","fdr") - outname = paste(args[1],".pipeclip.ztnb",sep="") - write.table(nb.out,outname,sep="\t",quote=F,row.names=F); -}#end of output -write(errmsg,paste(args[1],".pipeclip.ztnblog",sep="")); diff --git a/lib/__init__.py b/lib/__init__.py deleted file mode 100644 index af5a554..0000000 --- a/lib/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -#indicate this folder contains packages -import Alignment -import Enrich -import CLIP -import Mutation2 -import Utils -import OptValidator diff --git a/lib/barcodeRemoval.py b/lib/barcodeRemoval.py deleted file mode 100755 index 95c1adf..0000000 --- a/lib/barcodeRemoval.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/python -# Programmer : beibei.chen@utsouthwestern.edu -# Usage: remove PCR duplicates by comparing the sequences and return fastq file with barcode removed. 
-# Input: fastq file -# Output: fastq file -# Last modified: 28 June 2014 - Beibei Chen - -import sys -import re -import string - -class fastq: - def __init__(self,x): - self.id = x[0] - self.seq = x[1] - self.name = x[2] - self.quality = x[3] - - def __str__(self): - st = self.id+self.seq+self.name+self.quality - return st - -class fqList: - def __init__(self): - self.data = [] - self.unique = {} - - def readFq(self,fh,bl): - st = [] - buf = fh.readlines() - for i in range(len(buf)): - if buf[i][0]=="@" and buf[i+2][0]=="+": - st.append(buf[i]) - st.append(buf[i+1]) - st.append(buf[i+2]) - st.append(buf[i+3].rstrip()) - self.addFq(st) - if not self.unique.has_key(buf[i+1]): - self.addUniqFq(st,bl) - st = [] - - def addFq(self,s): - newFq = fastq(s) - self.data.append(newFq) - - def addUniqFq(self,s,b): - seq_key = s[1] - offset = b - newFq = fastq([s[0],s[1][offset:],s[2],s[3][offset:]]) - self.unique[seq_key] = newFq - -class barcodeRemover: - def __init__(self,infile,barLen): - self.infile = infile - self.barLen = barLen - - def run(self): - myfq = fqList() - myfq.readFq(self.infile,self.barLen) - for item in myfq.unique.values(): - if len(item.seq)>=15: - print item - -def barCodeRemovalMain(infilePath,barLenStr): - try: - infile = open(infilePath,"r+") - except IOError,message: - print >> sys.stderr, "cannot open file",message - sys.exit(1) - try: - barLen = int(barLenStr) - except: - barLen = 5 - barcodeRemover = barcodeRemover(infile,barLen) - barcodeRemover.run() - -def barCodeRemovalMain(): - try: - infile = open(sys.argv[1],"r+") - except IOError,message: - print >> sys.stderr, "cannot open file",message - sys.exit(1) - try: - barLen = int(sys.argv[2]) - except: - barLen = 5 - myBarcodeRemover = barcodeRemover(infile,barLen) - myBarcodeRemover.run() - -if __name__=="__main__": - barCodeRemovalMain() diff --git a/lib/pipeclip.py b/lib/pipeclip.py deleted file mode 100755 index 3b3c4b9..0000000 --- a/lib/pipeclip.py +++ /dev/null @@ -1,110 +0,0 @@ -#Main pipeline connects all the scripts together -#Original Programmer: beibei.chen@utsouthwestern.edu -#Refactored by: eric.roos@utsouthwestern.edu -#Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species -#Required packages: pysam, ghmm, pybedtools -#Last modification: 18 Sep 2014 - -#from lib import * - -import sys -import argparse -import logging -from subprocess import call -import os -import gc -import CLIP -import Alignment -import Utils -import Enrich -import OptValidator - -gc.enable() - -def prepare_argparser(): - description = "Find mutations" - epilog = "For command line options of each command, type %(prog)s COMMAND -h" - argparser = argparse.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile", type = str, required = True, help = "input bam file") - argparser.add_argument("-o","--output",dest = "outfile", type = str,required = True, help = "output file, default is stdout") - argparser.add_argument("-l","--matchLength",dest = "matchLength", type = int ,required = True, help = "shorted matched segment length") - argparser.add_argument("-m","--mismatch",dest = "mismatch", type = int,required = True, help = "maximum mismatch number") - argparser.add_argument("-c","--clipType",dest = "clipType", type = int,required = True, help = "CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP", choices=[0,1,2,3]) - argparser.add_argument("-r","--rmdup",dest = "dupRemove", type = int,required = True, 
help = "Remove PCR duplicate (0)No removal; (1)Remove by read start; (2)Remove by sequence; ", choices=[0,1,2]) - argparser.add_argument("-M","--fdrMutation",dest = "fdrMutation", type = float,required = True, help = "FDR for reliable mutations") - argparser.add_argument("-C","--fdrCluster",dest = "fdrCluster", type = float,required = True, help = "FDR for enriched clusters") - argparser.add_argument("-s","--species",dest = "species", type = str,required = True, help = "Species [\"mm10\",\"hg19\"]",choices=["mm10","hg19"]) - return(argparser) - -def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,clipType,fdrReliableMutation,species): - myClip = CLIP.CLIP(infile,outputPrefix) - logging.info("Start to run") - if myClip.testInput():#check input - logging.info("Input file OK,start to run PIPE-CLIP") - logging.info("Species info %s" % species) - if myClip.readfile(): - myClip.filter(matchLength,mismatch,clipType,rmdup,outputPrefix) - myClip.printMutations() - if len(myClip.clusters)>0: - logging.info("Get enriched clusters") - gc.collect() - status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) - if status: - logging.info("Found %d enriched clusters" % myClip.sigClusterCount) - myClip.printEnrichedClusters() - else: - logging.error("There is no enriched cluster found. Exit program") - sys.exit(1) - else: - logging.error("There is no clusters found. Please check input.Exit program.") - sys.exit(1) - - if len(myClip.mutations.keys())>0: - logging.info("Get reliable mutations") - Enrich.mutationEnrich(myClip,fdrReliableMutation) - myClip.printReliableMutations() - else: - logging.warning("There is no mutation found in this BAM file.") - #Start to get crosslinking sites - if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: - logging.info("Get cross-linking sites") - myClip.getCrosslinking() - if (len(myClip.crosslinking.keys())>0): - outfilelist = myClip.printCrosslinkingSites() - myClip.printCrosslinkingMutations() - else: - logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") - outfilelist = myClip.printEnrichClusters() - else: - if myClip.sigClusterCount <= 0: - logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") - sys.exit(2) - elif myClip.sigMutationCount <=0: - logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") - outfilelist = myClip.printEnrichClusters() - #annotation if possible - #logging.debug(outfilelist) - if species in ["mm10","mm9","hg19"]: - for name in outfilelist: - #logging.debug("Start to do annotation for %s" % name) - Utils.annotation(name,species) - else: - print >> sys.stderr, "File corruputed, program exit." 
- sys.exit(0) - - - -if __name__=="__main__": - arg_parser = prepare_argparser() - args = arg_parser.parse_args() - OptValidator.opt_validate() - infile = args.infile # Input SAM/BAM file - outputPrefix = args.outfile # Output prefix - matchLength = args.matchLength # Shorted matched segment length - mismatch = args.mismatch # Maximum mismatch number - rmcode = args.dupRemove - fdrEnrichedCluster = args.fdrCluster # FDR for enriched clusters - clipType =args.clipType # CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP - fdrReliableMutation = args.fdrMutation# FDR for reliable mutations - species = args.species # Species ["mm10","hg19"] - runPipeClip(infile,outputPrefix,matchLength,mismatch,rmcode,fdrEnrichedCluster,clipType,fdrReliableMutation,species) diff --git a/pipeclip.py b/pipeclip.py deleted file mode 100755 index e61893e..0000000 --- a/pipeclip.py +++ /dev/null @@ -1,130 +0,0 @@ -#Main pipeline connects all the scripts together -#Original Programmer: beibei.chen@utsouthwestern.edu -#Refactored by: eric.roos@utsouthwestern.edu -#Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species -#Required packages: pysam, ghmm, pybedtools - - -import sys -import argparse -import logging -import os -from time import gmtime, strftime -from lib import * - -def prepare_argparser(): - description = "Find mutations" - epilog = "For command line options of each command, type %(prog)s COMMAND -h" - argparser = argparse.ArgumentParser(description=description, epilog = epilog) - argparser.add_argument("-i","--input",dest = "infile", type = str, required = True, help = "input bam file") - argparser.add_argument("-t","--control",dest="ctrlfile",type=str,required=False, help = "control bam file") - argparser.add_argument("-o","--output",dest = "outfile", type = str,required = True, help = "output file, default is stdout") - argparser.add_argument("-l","--matchLength",dest = "matchLength", type = int ,required = True, help = "shorted matched segment length") - argparser.add_argument("-m","--mismatch",dest = "mismatch", type = int,required = True, help = "maximum mismatch number") - argparser.add_argument("-c","--clipType",dest = "clipType", type = int,required = True, help = "CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP", choices=[0,1,2,3]) - argparser.add_argument("-r","--rmdup",dest = "dupRemove", type = int,required = True, help = "Remove PCR duplicate (0)No removal; (1)Remove by read start; (2)Remove by sequence; ", choices=[0,1,2]) - argparser.add_argument("-M","--fdrMutation",dest = "fdrMutation", type = float,required = True, help = "FDR for reliable mutations") - argparser.add_argument("-C","--fdrCluster",dest = "fdrCluster", type = float,required = True, help = "FDR for enriched clusters") - argparser.add_argument("-s","--species",dest = "species", type = str, help = "Species [\"mm10\",\"hg19\"]",choices=["mm10","hg19"]) - return(argparser) - -def runPipeClip(infile,control,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,clipType,fdrReliableMutation,species): - myClip = CLIP.CLIP(infile,outputPrefix) - controlFlag = False - if control != None: - controlClip = CLIP.CLIP(control,outputPrefix+"Control") - logging.info("Start to run") - if myClip.testInput():#check input - logging.info("Input file OK,start to run PIPE-CLIP") - logging.info("Species info %s" % species) - if control != None: #test control file - if controlClip.testInput(): - logging.info("Control file OK. 
Use control in mutation enrichment.") - controlFlag = True - else: - logging.info("Control file format error. Continue without control.") - if myClip.readfile(): - myClip.filter2(matchLength,mismatch,clipType,rmdup) - if controlFlag: - logging.info("Read in control file") - controlClip.readfile() - controlClip.filter2(matchLength,mismatch,clipType,rmdup) - #myClip.printClusters() - #myClip.printMutations() - if myClip.clusterCount>0: - logging.info("Get enriched clusters") - status = Enrich.clusterEnrich_outsource(myClip,fdrEnrichedCluster) - if status: - logging.info("Found %d enriched clusters" % myClip.sigClusterCount) - myClip.printEnrichedClusters() - else: - logging.error("There is no enriched cluster found. Exit program") - sys.exit(1) - else: - logging.error("There is no clusters found. Please check input.Exit program.") - sys.exit(1) - - if myClip.mutationCount>0: - logging.info("Get reliable mutations") - if controlFlag: #use control - Enrich.mutationEnrichWCtrl(myClip,controlClip,fdrReliableMutation) - else: - Enrich.mutationEnrich(myClip,fdrReliableMutation) - logging.info("There are %d reliable mutations" % myClip.sigMutationCount) - myClip.printReliableMutations() - else: - logging.warning("There is no mutation found in this BAM file.") - #Start to get crosslinking sites - if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: - logging.info("Get cross-linking sites") - myClip.getCrosslinking() - if (len(myClip.crosslinking.keys())>0): - outfilelist = myClip.printCrosslinkingSites() - myClip.printCrosslinkingMutations() - else: - logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") - outfilelist = myClip.printEnrichClusters() - else: - if myClip.sigClusterCount <= 0: - logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") - sys.exit(2) - elif myClip.sigMutationCount <=0: - logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") - outfilelist = myClip.printEnrichClusters() - #annotation if possible - if species in ["mm10","mm9","hg19"]: - logging.info("Started to annotate cross-linking sits using HOMER") - for name in outfilelist: - #logging.debug("Start to do annotation for %s" % name) - Utils.annotation(name,species) - #output a status log file - logfile = open(outputPrefix+".pipeclip.summary.log","w") - print >>logfile,"PIPE-CLIP run finished. Parameters are:" - print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation) - print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment) - print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters)) - print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount) - print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations)) - logfile.close() - logging.info("PIPE-CLIP finished the job, please check your results. 
:)") - else: - logging.error("File corruputed, program exit.") - sys.exit(0) - - - -if __name__=="__main__": - arg_parser = prepare_argparser() - args = arg_parser.parse_args() - OptValidator.opt_validate() - infile = args.infile # Input SAM/BAM file - control = args.ctrlfile - outputPrefix = args.outfile # Output prefix - matchLength = args.matchLength # Shorted matched segment length - mismatch = args.mismatch # Maximum mismatch number - rmcode = args.dupRemove - fdrEnrichedCluster = args.fdrCluster # FDR for enriched clusters - clipType =args.clipType # CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP - fdrReliableMutation = args.fdrMutation# FDR for reliable mutations - species = args.species # Species ["mm10","hg19"] - runPipeClip(infile,control,outputPrefix,matchLength,mismatch,rmcode,fdrEnrichedCluster,clipType,fdrReliableMutation,species) diff --git a/pipeclip/Alignment.py b/pipeclip/Alignment.py new file mode 100644 index 0000000..51815e6 --- /dev/null +++ b/pipeclip/Alignment.py @@ -0,0 +1,121 @@ +#!/usr/bin/python +# programmer: beibei.chen@utsouthwestern.edu +# Usage: definition of alignment files including BED and BAM + + +from . import Enrich + + +class BED: + def __init__(self, chr, start, stop, name, score, strand): + self.chr = chr + self.start = int(start) + self.stop = int(stop) + self.name = name + self.score = int(score) + self.strand = strand + + def __str__(self): + st = "\t".join( + [ + str(self.chr), + str(self.start), + str(self.stop), + self.name, + str(self.score), + self.strand, + ] + ) + return st + + def merge(self, read): + self.stop = read.stop + self.score += 1 + + def overlap(self, read): + if self.chr == read.chr and self.strand == read.strand: + if self.start <= read.stop and self.stop >= read.start: + return True + else: + return False + + def updateScore(self, s): + self.score = int(s) + + def increaseScore(self): + self.score += 1 + + +class ClusterBed(BED): + def __init__(self, chr, start, stop, name, score, strand): + self.pvalue = 0 + self.qvalue = 0 + self.sig = False + BED.__init__(self, chr, start, stop, name, score, strand) + + +class MutationBed(BED): + def __init__(self, chr, start, stop, name, score, strand, type): + self.type = type # insertion,deletion,type of substitution + self.kvalue = 0 + BED.__init__(self, chr, start, stop, name, score, strand) + self.pvalue = 0 + self.qvalue = 0 + self.sig = False + + def updateK(self, k): + self.kvalue = k + + def __str__(self): + st = BED.__str__(self) + st += "\t" + self.type + "\t" + str(self.kvalue) + return st + + +class CrosslinkingBed(BED): + def __init__( + self, + chr, + start, + stop, + name, + score, + strand, + clusterP, + clusterQ, + mStart, + mName, + mP, + ): + self.fisherP = 0 + self.clusterP = float(clusterP) + self.mutationP = [mP] + self.qvalue = float(clusterQ) + self.mutationStarts = [str(mStart)] + self.mutationNames = [mName] + BED.__init__(self, chr, start, stop, name, score, strand) + + def addMutation(self, mu): # mu is a instance + self.mutationNames.append(mu.name) + self.mutationStarts.append(str(mu.start)) + self.mutationP.append(mu.pvalue) + + def fishertest(self): + fp = Enrich.fisherTest(self.clusterP, self.mutationP) + self.fisherP = fp + + +class wiglist: + def __init__(self): + self.pos = [] + self.value = [] + + def valueByPos(self, p): + try: + return self.value[self.pos.index(p)] + except: + return False + + def update(self, p, v): + self.pos.append(p) + self.value.append(v) diff --git a/pipeclip/CLIP.py b/pipeclip/CLIP.py new file mode 100644 index 
0000000..ee4ec08 --- /dev/null +++ b/pipeclip/CLIP.py @@ -0,0 +1,678 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Define CLIP class.""" + +import datetime +import gc +import gzip +import random + +import pysam + +from . import Alignment, Utils, mutation + +LOGGER = Utils.get_logger(__name__) + +gc.enable() + + +class CLIP: + def __init__(self, fileaddr, prefix): + self.filepath = fileaddr + self.originalBAM = None + self.originalMapped = 0 + self.refInfo = [] + self.posfilteredBAM = None + self.negfilteredBAM = None + self.filteredAlignment = 0 + self.type = 0 + self.outprefix = prefix + self.currentGroupKey = "None" + self.currentGroup = [] # Used in rmdup + self.previousQul = [0, 0, 0] # for rmdup,[matchlen,mapq,mismatch] + self.clusters = [] + self.clusters_plus = [] + self.clusters_minus = [] + self.clusterCount = 0 + self.currentCluster_plus = Alignment.BED("", 0, 0, "", 0, "+") + self.currentCluster_minus = Alignment.BED("", 0, 0, "", 0, "-") + # self.sigClusters = {} + self.mutations = {} # Dictionary of bed instance + self.mutationCount = 0 + self.mutationList = [] + self.sigMutations = {} # same as sigClusters + self.kmpair = {} + self.sigMutationCount = 0 + self.sigClusterCount = 0 + self.coverage = 0 # "reads coverage of this sample" + self.bamheader = None + self.crosslinking = {} + self.crosslinkingMutations = [] + + def __str__(self): + pass + + def testInput(self): + """Test the input file format, modify self.filepath and return bool + Status: True:file is ready to use; False: wrong file, program stop + """ + # test if file has header + try: + self.header = pysam.view("-H", self.filepath) + except: + try: + self.header = pysam.view("-SH", self.filepath) + except: + LOGGER.error( + "Input file does not have header, please check your file. Program quit" + ) + return (False, "None") + # Header test passed, test if it is BAM + try: + infile = gzip.open(self.filepath) + infile.readline(10) + except: # cannot read line, should be sam + LOGGER.info("Input is SAM, converting to BAM...") + bamout = ".".join(self.filepath.split(".")[0:-1]) + "." + "bam" + infile = pysam.Samfile(self.filepath, "r", header=self.header) + # print >> sys.stderr,pysam.view("-SH",infile) + outfile = pysam.Samfile(bamout, "wb", template=infile) + for i in infile.fetch(): + outfile.write(i) + self.filepath = bamout + # Now the infile is BAM,check if it is sorted + if Utils.is_sorted(self.header): + pysam.index(self.filepath) + return True + else: # sort the BAM + LOGGER.info("Input is not sorted, sorting file...") + bamsort = ( + ".".join(self.filepath.split(".")[0:-1]) + "." 
+ "sort.bam" + ) + pysam.sort("-o", bamsort, self.filepath) + pysam.index(bamsort) + self.filepath = bamsort # change input file path + self.header = pysam.view("-H", bamsort) + LOGGER.info("Input file sorted") + # if Utils.is_sorted(self.header): + # print >> sys.stderr, "The file is sorted" + return True + + def readfile(self): + try: + self.originalBAM = pysam.Samfile(self.filepath, "rb") + self.refInfo = list( + zip(self.originalBAM.references, self.originalBAM.lengths) + ) + return True + except IOError: + LOGGER.error("Cannot open input file") + return False + + # def printFilteredReads(self): + # for i in self.filteredAlignment: + # print i + + def printClusters(self): + outfile = open(self.outprefix + ".clusters.bed", "w") + for i in [self.clusters_plus, self.clusters_minus]: + for j in i: + print(j, file=outfile) + outfile.close() + + def printMutations(self): + outfile = open(self.outprefix + ".mutations.bed", "w") + for i in list(self.mutations.values()): + print(i, file=outfile) + outfile.close() + + def printMutations_chr(self): + fhs = {} # file handle dic, key is chr, value is file handle + for chr, chrlen in self.refInfo: + outfile = open(self.outprefix + "." + chr + ".mutations.bed", "w") + fhs[chr] = outfile + for i in self.mutations.values(): + print(i, file=fhs[i.chr]) + # clean up + for fh in fhs.values(): + fh.close() + del fhs + + def printMutationSingleChr(self, chr): + outfile = open(self.outprefix + "." + chr + ".mutations.bed", "w") + for i in self.mutations.values(): + print(i, file=outfile) + # clean up + outfile.close() + + def printReliableMutations(self): + outfile = open(self.outprefix + ".reliableMutations.pipeclip.bed", "w") + header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr" + print(header, file=outfile) + self.printEnrichedItem(self.sigMutations, outfile) + + def printEnrichedClusters(self): + outfile = open(self.outprefix + ".enrichedClusters.pipeclip.bed", "w") + header = ( + "#chr\tstart\tstop\tcluster_name\tread_count\tstrand\tp_value\tfdr" + ) + print(header, file=outfile) + self.printReliableList(self.clusters, outfile) + return [self.outprefix + ".enrichedClusters.pipeclip.bed"] + + def printCrosslinkingMutations(self): + outfile = open( + self.outprefix + ".crosslinkingMutations.pipeclip.bed", "w" + ) + header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr" + print(header, file=outfile) + self.printReliableList(self.crosslinkingMutations, outfile) + + def printReliableList(self, mylist, fh): + for i in mylist: + if i.sig: + st = i.__str__() + st += "\t" + str(i.pvalue) + "\t" + str(i.qvalue) + print(st, file=fh) + + def printList(self, mylist, fh): + for i in mylist: + print(i, file=fh) + + def printEnrichedItem(self, dic, fh): + for k in list(dic.keys()): + for i in dic[k]: + st = i.__str__() + st += "\t" + str(i.pvalue) + "\t" + str(i.qvalue) + print(st, file=fh) + + def printCrosslinkingSites(self): + output = self.outprefix + header = "#chr\tstart\tstop\tcluster_name\treads_count\tstrand\tcluster_fdr\tcrosslinking_fisherP\tmutation_pos\tmutation_name" + if self.type == 0: # HITS-CLIP, three output + output_del = open( + output + "_deletion_crosslinking.pipeclip.txt", "w" + ) + output_sub = open( + output + "_substitution_crosslinking.pipeclip.txt", "w" + ) + output_ins = open( + output + "_insertion_crosslinking.pipeclip.txt", "w" + ) + print(header, file=output_del) + print(header, file=output_sub) + print(header, file=output_ins) + else: + output_name = open(output 
+ "_crosslinking.pipeclip.txt", "w") + print(header, file=output_name) + for k in list(self.crosslinking.keys()): + st = self.crosslinking[k].__str__() + st += "\t" + "\t".join( + [ + str(self.crosslinking[k].qvalue), + str(self.crosslinking[k].fisherP), + ] + ) + st += "\t" + ",".join(self.crosslinking[k].mutationStarts) + st += "\t" + ",".join(self.crosslinking[k].mutationNames) + if self.type == 0: + output_key = k.split("_")[-1] + if output_key == "Deletion": + print(st, file=output_del) + elif output_key == "Insertion": + print(st, file=output_ins) + elif output_key == "Substitution": + print(st, file=output_sub) + else: + print(st, file=output_name) + + if self.type == 0: + output_del.close() + output_sub.close() + output_ins.close() + return [ + output + "_insertion_crosslinking.pipeclip.txt", + output + "_deletion_crosslinking.pipeclip.txt", + output + "_substitution_crosslinking.pipeclip.txt", + ] + else: + output_name.close() + return [output + "_crosslinking.pipeclip.txt"] + + def updatePreviousQul(self, n, q, m): + self.previousQul[0] = n + self.previousQul[1] = q + self.previousQul[2] = m + + def updateCurrentGroup(self, read, mlen, mis): + """Compare read to current duplication group parameters, determine to + add to, to drop or to replace, make sure duplication group only has + reads with best quality.""" + if ( + mlen >= self.previousQul[0] + and read.mapq >= self.previousQul[1] + and mis <= self.previousQul[2] + ): # consider to append or replace only when read has no worse quality + if ( + mlen > self.previousQul[0] + or read.mapq > self.previousQul[1] + or mis < self.previousQul[2] + ): # read has better quality,replace + self.currentGroup = [read] + self.updatePreviousQul(mlen, read.mapq, mis) + else: + self.currentGroup.append(read) + + def iniDupGroupInfo(self, read, group_key, mlength, mismatch): + self.currentGroupKey = group_key + self.currentGroup = [read] + self.updatePreviousQul(mlength, read.mapq, mismatch) + + def rmdup(self): + """Return random one read of highest quality from list.""" + # print "get one from group" + if len(self.currentGroup) == 1: + # print self.currentGroup[0] + return self.currentGroup[0] + else: + random.seed(1) + index = random.randint(0, len(self.currentGroup) - 1) + # print self.currentGroup[index] + return self.currentGroup[index] + + def updateCluster(self, read): + """Cluster new read to known clusters and update cluster reads + count.""" + strandDic = {"True": "-", "False": "+"} + clusterName = ( + "cluster" + + "_" + + str(len(self.clusters_plus) + len(self.clusters_minus) + 1) + ) + newRead = Alignment.ClusterBed( + self.originalBAM.getrname(read.tid), + read.pos, + read.pos + len(read.seq), + clusterName, + 1, + strandDic[str(read.is_reverse)], + ) + if read.is_reverse: + if self.currentCluster_minus.chr == "": # Initiate cluster + self.currentCluster_minus = newRead + self.clusters_minus.append(self.currentCluster_minus) + self.clusterCount += 1 + else: + if self.currentCluster_minus.overlap(newRead): + self.currentCluster_minus.merge(newRead) + self.clusters_minus[-1] = self.currentCluster_minus + else: # New read is a new cluster + # self.clusters.append(self.currentCluster) + self.currentCluster_minus = newRead + self.clusters_minus.append(self.currentCluster_minus) + self.clusterCount += 1 + else: + if self.currentCluster_plus.chr == "": # Initiate cluster + self.currentCluster_plus = newRead + self.clusters_plus.append(self.currentCluster_plus) + self.clusterCount += 1 + else: + if 
self.currentCluster_plus.overlap(newRead): + self.currentCluster_plus.merge(newRead) + self.clusters_plus[-1] = self.currentCluster_plus + else: # New read is a new cluster + # self.clusters.append(self.currentCluster) + self.currentCluster_plus = newRead + self.clusters_plus.append(self.currentCluster_plus) + self.clusterCount += 1 + + def updateMutation(self, read, mis): + mutations = [] + if self.type == 3: # iclip,find truncation + mutations = mutation.getTruncations(self.originalBAM, read) + else: + mutations = mutation.getMutations(self.originalBAM, read) + if self.type == 1: + mutation_filter = Utils.filterMutations( + mutations, "T->C", True + ) + mutations = mutation_filter + elif self.type == 2: + mutation_filter = Utils.filterMutations( + mutations, "G->A", True + ) + mutations = mutation_filter + # LOGGER.debug("read %s " % read) + if len(mutations) > 0: + for m in mutations: + # print m + self.mutationCount += 1 + m_key = "_".join([m.chr, str(m.start), m.strand, m.type]) + if m_key in self.mutations: + self.mutations[m_key].increaseScore() + else: + self.mutations[m_key] = m + + def updateCLIPinfo(self, read, matchlen, miscount): + """Update sample coverage info, clustering, mutation info.""" + # LOGGER.debug("read %s, cigar %s,mismatch %d" % (read.qname,read.cigar,miscount)) + # update sample coverage info + self.coverage += matchlen + # update cluster info + self.updateCluster(read) + # update mutation info + if miscount > 0: + self.updateMutation(read, miscount) + + def addSigToDic(self, dic, mu): + """Add new mutation into the dictionary.Mutations should be sorted.""" + if mu.chr in dic: + dic[mu.chr].append(mu) + else: + dic[mu.chr] = [mu] + + def getCrosslinking(self): + """Merge enriched clusters and reliable mutations together Call + Enrich.fisherTest() to calculate joint p vlaue.""" + for cluster in self.clusters: + # LOGGER.debug("P value of cluster is %f" % cluster.pvalue) + if cluster.sig and cluster.chr in self.sigMutations: + for mutation in self.sigMutations[cluster.chr]: + # LOGGER.debug("Cluster loc %d,%d ; Mutation loc %d,%d, mutation type %s" % (cluster.start,cluster.stop,mutation.start,mutation.stop,mutation.type)) + if cluster.overlap(mutation): + # LOGGER.debug("Overlapped") + if self.type == 0: # HITS-CLIP + mutation_key = mutation.type.split("->")[0] + if mutation_key in ["A", "C", "G", "T"]: + mutation_key = "Substitution" + cross_key = cluster.name + "_" + mutation_key + else: + cross_key = cluster.name + if cross_key in self.crosslinking: + # LOGGER.debug("Existing mutation pvalue:",self.crosslinking[cluster.name].mutationP) + self.crosslinking[cross_key].addMutation(mutation) + self.crosslinkingMutations.append(mutation) + else: + self.crosslinking[ + cross_key + ] = Alignment.CrosslinkingBed( + cluster.chr, + cluster.start, + cluster.stop, + cluster.name, + cluster.score, + cluster.strand, + cluster.pvalue, + cluster.qvalue, + mutation.start, + mutation.name, + mutation.pvalue, + ) + self.crosslinkingMutations.append(mutation) + # start to calculate fisher test p value + for k in self.crosslinking.keys(): + self.crosslinking[k].fishertest() + + def filter(self, matchLen, mismatch, cliptype, duprm): + """Filter the input BAM file according to parameters. 
+ + Make clusters and mutations at the same time + """ + LOGGER.info("Start to filter alignment using parameters:") + LOGGER.info("match length:%d" % (matchLen)) + LOGGER.info("mismatch count: %d" % (mismatch)) + LOGGER.info("CLIP type:%s" % (str(cliptype))) + LOGGER.info("Rmdup code:%s" % (str(duprm))) + LOGGER.info( + "There are %d reads in origianl input file" + % (self.originalBAM.mapped) + ) + # print >> sys.stderr,zip(self.originalBAM.references,self.originalBAM.lengths) + # sys_info = os.system("free -m") + # LOGGER.debug(sys_info) + # outBAM = pysam.Samfile(outprefix+".filtered.bam","wb",template=self.originalBAM) + self.originalMapped = self.originalBAM.mapped + outBAM_pos = pysam.Samfile( + self.outprefix + ".pos.filtered.bam", + "wb", + template=self.originalBAM, + ) + outBAM_neg = pysam.Samfile( + self.outprefix + ".neg.filtered.bam", + "wb", + template=self.originalBAM, + ) + self.type = cliptype + if cliptype == 3: # make sure there is no rmdup for iCLIP data + duprm = 0 + count = 0 + start_time = datetime.datetime.now() + for alignment in self.originalBAM: + # print "Now processing",alignment.qname + if not alignment.cigar: # reads is unmapped + continue + count += 1 + if count % 500000 == 0: + stop_time = datetime.datetime.now() + LOGGER.debug( + "Processed %d reads in %s" + % (count, str(stop_time - start_time)) + ) + start_time = stop_time + flag, mlen, mis = Utils.readQuaFilter( + alignment, matchLen, mismatch + ) + if flag: + # print "Qualified read" + # print alignment + # print "current Gourp key",self.currentGroupKey + if duprm > 0: + # get group key + if duprm == 1: + groupkey = Utils.rmdupKey_Start(alignment) + elif duprm == 2: + groupkey = Utils.rmdupKey_Seq(alignment) + # check current group + if ( + groupkey == self.currentGroupKey + ): # overlap with current group, update group + self.updateCurrentGroup(alignment, mlen, mis) + else: # get read from current group and discard it, use current read to start a new current group + if ( + self.currentGroupKey != "None" + ): # current group exists + keep = self.rmdup() + # LOGGER.debug("Pop out read to keep %s" % keep) + self.currentGroup = [] + self.filteredAlignment += 1 + flag, mlen, mis = Utils.readQuaFilter( + keep, matchLen, mismatch + ) + self.updateCLIPinfo(keep, mlen, mis) + # outBAM.write(keep) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + self.iniDupGroupInfo( + alignment, groupkey, mlen, mis + ) # make new group using current alignment + else: # there is no rmdup + # LOGGER.debug("Good read, update clip info %s" % read.qname) + self.filteredAlignment += 1 + self.updateCLIPinfo(alignment, mlen, mis) + # outBAM.write(alignment) + if alignment.is_reverse: + outBAM_neg.write(alignment) + else: + outBAM_pos.write(alignment) + # clean up the final dupGroup, if rmdup==0, there is no final dupGroup + if len(self.currentGroup) > 0: + keep = self.rmdup() + self.currentGroup = [] + self.filteredAlignment += 1 + flag, mlen, mis = Utils.readQuaFilter(keep, matchLen, mismatch) + self.updateCLIPinfo(keep, mlen, mis) + # outBAM.write(alignment) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + # LOGGER CLIP sample information + # outBAM.close() + outBAM_pos.close() + outBAM_neg.close() + # pysam.index(outprefix+".filtered.bam") + pysam.index(self.outprefix + ".pos.filtered.bam") + pysam.index(self.outprefix + ".neg.filtered.bam") + self.posfilteredBAM = self.outprefix + ".pos.filtered.bam" + self.negfilteredBAM = self.outprefix + ".neg.filtered.bam" + 
self.clusters = self.clusters_plus + self.clusters_minus + self.printClusters() + self.printMutations_chr() + # Clean up variables + self.originalBAM = None + self.clusters_plus = None + self.clusters_minus = None + self.mutations = None + gc.collect() + LOGGER.debug( + "After filtering, %d reads left" % (self.filteredAlignment) + ) + LOGGER.debug("There are %d clusters in total" % (self.clusterCount)) + LOGGER.debug("There are %d mutations in total" % (self.mutationCount)) + + def filter2(self, matchLen, mismatch, cliptype, duprm): + """Filter the input BAM file according to parameters. + + Make clusters and mutations at the same time + """ + LOGGER.info("Start to filter alignment using parameters:") + LOGGER.info("match length:%d" % (matchLen)) + LOGGER.info("mismatch count: %d" % (mismatch)) + LOGGER.info("CLIP type:%s" % (str(cliptype))) + LOGGER.info("Rmdup code:%s" % (str(duprm))) + LOGGER.info( + "There are %d reads in origianl input file" + % (self.originalBAM.mapped) + ) + + self.originalMapped = self.originalBAM.mapped + outBAM_pos = pysam.Samfile( + self.outprefix + ".pos.filtered.bam", + "wb", + template=self.originalBAM, + ) + outBAM_neg = pysam.Samfile( + self.outprefix + ".neg.filtered.bam", + "wb", + template=self.originalBAM, + ) + self.type = cliptype + if cliptype == 3: # make sure there is no rmdup for iCLIP data + duprm = 0 + count = 0 + start_time = datetime.datetime.now() + currentChr = "" + for alignment in self.originalBAM: + if not alignment.cigar: # reads is unmapped + continue + c_chr = self.originalBAM.getrname(alignment.tid) + if currentChr == "": + currentChr = c_chr + elif c_chr != currentChr: + # LOGGER.debug("Mutation sites for Current ref %s is %d" % (currentChr,len(self.mutations.keys()))) + self.printMutationSingleChr(currentChr) + self.mutations = None + gc.collect() + self.mutations = {} + currentChr = c_chr + count += 1 + if count % 5000000 == 0: + stop_time = datetime.datetime.now() + LOGGER.debug( + "Processed %d reads in %s" + % (count, str(stop_time - start_time)) + ) + start_time = stop_time + flag, mlen, mis = Utils.readQuaFilter( + alignment, matchLen, mismatch + ) + if flag: + if duprm > 0: + # get group key + if duprm == 1: + groupkey = Utils.rmdupKey_Start(alignment) + elif duprm == 2: + groupkey = Utils.rmdupKey_Seq(alignment) + # check current group + if ( + groupkey == self.currentGroupKey + ): # overlap with current group, update group + self.updateCurrentGroup(alignment, mlen, mis) + else: # get read from current group and discard it, use current read to start a new current group + if ( + self.currentGroupKey != "None" + ): # current group exists + keep = self.rmdup() + # LOGGER.debug("Pop out read to keep %s" % keep) + self.currentGroup = [] + self.filteredAlignment += 1 + flag, mlen, mis = Utils.readQuaFilter( + keep, matchLen, mismatch + ) + self.updateCLIPinfo(keep, mlen, mis) + # outBAM.write(keep) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + self.iniDupGroupInfo( + alignment, groupkey, mlen, mis + ) # make new group using current alignment + else: # there is no rmdup + # LOGGER.debug("Good read, update clip info %s" % read.qname) + self.filteredAlignment += 1 + self.updateCLIPinfo(alignment, mlen, mis) + # outBAM.write(alignment) + if alignment.is_reverse: + outBAM_neg.write(alignment) + else: + outBAM_pos.write(alignment) + # clean up the final dupGroup, if rmdup==0, there is no final dupGroup + if len(self.currentGroup) > 0: + keep = self.rmdup() + self.currentGroup = [] + 
self.filteredAlignment += 1 + flag, mlen, mis = Utils.readQuaFilter(keep, matchLen, mismatch) + self.updateCLIPinfo(keep, mlen, mis) + # outBAM.write(alignment) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + # clean up current mutation hash + self.printMutationSingleChr(currentChr) + self.mutations = None + # LOGGER CLIP sample information + # outBAM.close() + outBAM_pos.close() + outBAM_neg.close() + pysam.index(self.outprefix + ".pos.filtered.bam") + pysam.index(self.outprefix + ".neg.filtered.bam") + self.posfilteredBAM = self.outprefix + ".pos.filtered.bam" + self.negfilteredBAM = self.outprefix + ".neg.filtered.bam" + self.clusters = self.clusters_plus + self.clusters_minus + self.printClusters() + # Clean up variables + self.originalBAM = None + self.clusters_plus = None + self.clusters_minus = None + self.mutations = None + gc.collect() + LOGGER.debug( + "After filtering, %d reads left" % (self.filteredAlignment) + ) + LOGGER.debug("There are %d clusters in total" % (self.clusterCount)) + LOGGER.debug("There are %d mutations in total" % (self.mutationCount)) diff --git a/pipeclip/Enrich.py b/pipeclip/Enrich.py new file mode 100644 index 0000000..171d271 --- /dev/null +++ b/pipeclip/Enrich.py @@ -0,0 +1,484 @@ +#!/usr/bin/python +# Programmer : beibei.chen@utsouthwestern.edu +# Usage: Get reliable mutations using binomial distribution +# Input: Filtered BAM, reads coverage (generated by SAMFilter.py), mutation file +# Output: BED +# Last modified: 18 Sep.2013 + + +import datetime +import gc +import math +import os +import subprocess +import sys +from collections import Counter +from pathlib import Path + +import pysam +import rpy2.robjects as robject +from rpy2.robjects.packages import importr + +from . import Alignment, Utils + +LOGGER = Utils.get_logger(__name__) + +gc.enable() + +stats = importr("stats") + + +def freqRank(readCount, rev=False): + key = sorted(list(readCount.keys()), reverse=rev) + r_rank = {} + rank = 0 + for i in key: + rank += readCount[i] + r_rank[i] = rank + return r_rank + + +def BH(pvalue, pRank, N): + a = N / float(pRank) + q = a * pvalue + qva = max(pvalue, q) + return qva + + +# @profile +def KMvalue(posmapfile, negmapfile, mufile): + """Calculate K(coverage) value for each mutation location Mutations are + already unique.""" + km = [] # store mutations with updated k value + km_pair = {} # Dic of count tuples of (k,m),key:"K_M" + count = 0 + LOGGER.debug("make wig %s" % str(datetime.datetime.now())) + poswig = Utils.makeWig(posmapfile) + negwig = Utils.makeWig(negmapfile) + start_time = datetime.datetime.now() + LOGGER.debug("finish making wig %s" % str(start_time)) + for item in mufile: + count += 1 + if count % 5000 == 0: + stop_time = datetime.datetime.now() + LOGGER.debug( + "Counting K-M for %d mutation sites, using %s" + % (count, str(stop_time - start_time)) + ) + start_time = stop_time + st = [] + strand = item.strand + M = item.score + K = 0 + # LOGGER.debug("Time begin to pileup is %s" % (str(datetime.datetime.now()))) + if strand == "+": + try: + K = poswig[item.chr][str(item.start)] + except: + continue + + elif strand == "-": + try: + K = negwig[item.chr][str(item.start)] + except: + continue + if K >= M: + item.updateK(K) + # print item + # LOGGER.debug("K value for item %s is %d" % (item, K)) + pair_name = str(K) + "_" + str(M) + if pair_name in km_pair: + km_pair[pair_name] += 1 + else: + km_pair[pair_name] = 1 + # km.append(item) + gc.collect() + return km_pair + + +def KMvalue_test(clip, mutations, 
chr, chrlen): + """Calculate K(coverage) value for each mutation location Mutations are + already unique.""" + km = [] # store mutations with updated k value + count = 0 + # LOGGER.debug("make wig %s" % str(datetime.datetime.now())) + posBAM = pysam.Samfile(clip.posfilteredBAM, "rb") + negBAM = pysam.Samfile(clip.negfilteredBAM, "rb") + start_time = datetime.datetime.now() + poswig = Utils.makeWigByChr(posBAM, chr) + negwig = Utils.makeWigByChr(negBAM, chr) + stop_time = datetime.datetime.now() + # LOGGER.debug("Finished making wig for %s using %s" % (chr,str(stop_time-start_time))) + start_time = stop_time + for item in mutations: + count += 1 + if count % 100000 == 0: + stop_time = datetime.datetime.now() + LOGGER.debug( + "Counting K-M for %d mutation sites, using %s" + % (count, str(stop_time - start_time)) + ) + start_time = stop_time + st = [] + M = item.score + K = 0 + strand = item.strand + if strand == "+": + try: + K = poswig[item.start] + except: + # log.warning("Did not find mutation in poswig") + continue + elif strand == "-": + try: + K = negwig[item.start] + except: + continue + if K >= M: + item.updateK(K) + pair_name = str(K) + "_" + str(M) + if pair_name in clip.kmpair: + clip.kmpair[pair_name] += 1 + else: + clip.kmpair[pair_name] = 1 + posBAM.close() + negBAM.close() + + +def uniq(b): # b is a list + uniqElements = [] + for i in b: + if uniqElements.count(i) == 0: + uniqElements.append(i) + uniqElements.sort() + return uniqElements + + +def mutationEnrich(clip, threshold=0.01): + coverage = clip.coverage * 1.0 + totalMuCount = clip.mutationCount + mutations = [] + total_test = 0 + for chr, chrlen in clip.refInfo: + # LOGGER.debug(chr) + try: + mufile = open(clip.outprefix + "." + chr + ".mutations.bed") + except: + LOGGER.info( + "Cannot open mutation file %s , move on." + % (clip.outprefix + "." 
+ chr + ".mutations.bed") + ) + continue + for record in mufile: + total_test += 1 + info = record.rstrip().split("\t") + new_mu = Alignment.MutationBed( + info[0], + int(info[1]), + int(info[2]), + info[3], + int(info[4]), + info[5], + info[6], + ) + mutations.append(new_mu) + try: + pass + # os.remove(clip.outprefix+"."+chr+".mutations.bed") + except: + pass + LOGGER.debug(len(mutations)) + KMvalue_test( + clip, mutations, chr, chrlen + ) # check after doing KM, if clip.mutations changed + try: + os.remove(clip.posfilteredBAM) + os.remove(clip.negfilteredBAM) + os.remove(clip.posfilteredBAM + ".bai") + os.remove(clip.negfilteredBAM + ".bai") + except: + pass + del clip.posfilteredBAM + del clip.negfilteredBAM + gc.collect() # LOGGER.info("Finished K-M counting, starting fitting.") + + R = robject.r + reliableList = [] + P = totalMuCount / coverage + km_p = {} # store km and corresponding p value + pvalues = [] + for k in clip.kmpair: # KM_test: + parameters = k.split("_") + p = R.pbinom(int(parameters[1]) - 1, int(parameters[0]), P, False)[0] + pvalues.append(p) + km_p[k] = p + pCount = dict(Counter(pvalues)) + pRank = freqRank(pCount, True) + pqDic = {} + for i in list(pRank.keys()): + try: + p_rank = pRank[i] + q_value = BH(i, p_rank, total_test) + pqDic[i] = q_value + except: + print("Cannot find p value in dictionary", file=sys.stderr) + continue + count = 0 + for mu in mutations: + name = str(mu.kvalue) + "_" + str(mu.score) + try: + mu.pvalue = km_p[name] + except: + # LOGGER.debug("Problem with %s" % mu) + continue + mu.qvalue = pqDic[mu.pvalue] + if mu.qvalue <= threshold: + count += 1 + new_mutationName = "Mutation_" + str(count) + mu.name = new_mutationName + mu.sig = True + clip.sigMutationCount += 1 + clip.addSigToDic(clip.sigMutations, mu) + + mutations = None + gc.collect() + + +def mutationEnrichWCtrl(clip, ctrlclip, threshold=0.01): + coverage = ctrlclip.coverage * 1.0 + totalMuCount = ctrlclip.mutationCount + mutations = [] + total_test = 0 + for chr, chrlen in clip.refInfo: + try: + mufile = open(clip.outprefix + "." + chr + ".mutations.bed") + except: + LOGGER.info( + "Cannot open mutation file %s , move on." + % (clip.outprefix + "." + chr + ".mutations.bed") + ) + continue + for record in mufile: + total_test += 1 + info = record.rstrip().split("\t") + new_mu = Alignment.MutationBed( + info[0], + int(info[1]), + int(info[2]), + info[3], + int(info[4]), + info[5], + info[6], + ) + mutations.append(new_mu) + try: + os.remove(clip.outprefix + "." + chr + ".mutations.bed") + os.remove(ctrlclip.outprefix + "." 
+ chr + ".mutations.bed") + except: + pass + KM_test = KMvalue_test( + clip, mutations, chr, chrlen + ) # check after doing KM, if clip.mutations changed + try: + os.remove(clip.posfilteredBAM) + os.remove(clip.negfilteredBAM) + os.remove(clip.posfilteredBAM + ".bai") + os.remove(clip.negfilteredBAM + ".bai") + except: + pass + del clip.posfilteredBAM + del clip.negfilteredBAM + gc.collect() # LOGGER.info("Finished K-M counting, starting fitting.") + + R = robject.r + reliableList = [] + P = totalMuCount / coverage + km_p = {} # store km and corresponding p value + pvalues = [] + for k in clip.kmpair: # KM_test: + parameters = k.split("_") + p = R.pbinom(int(parameters[1]) - 1, int(parameters[0]), P, False)[0] + pvalues.append(p) + km_p[k] = p + pCount = dict(Counter(pvalues)) + pRank = freqRank(pCount, True) + pqDic = {} + for i in list(pRank.keys()): + try: + p_rank = pRank[i] + q_value = BH(i, p_rank, total_test) + pqDic[i] = q_value + except: + print("Cannot find p value in dictionary", file=sys.stderr) + continue + count = 0 + for mu in mutations: + name = str(mu.kvalue) + "_" + str(mu.score) + try: + mu.pvalue = km_p[name] + except: + # LOGGER.debug("Problem with %s" % mu) + continue + mu.qvalue = pqDic[mu.pvalue] + if mu.qvalue <= threshold: + count += 1 + new_mutationName = "Mutation_" + str(count) + mu.name = new_mutationName + mu.sig = True + clip.sigMutationCount += 1 + clip.addSigToDic(clip.sigMutations, mu) + + mutations = None + gc.collect() + + +def clusterEnrich(clip, threshold=0.01): + cluster_filename = ( + clip.outprefix + ".clusters.bed" + ) # clip.filepath.split("/")[-1].split(".")[0] + # Call R code and get result + epsilon = [0.01, 0.15, 0.1] + step = [0.1, 0.08, 0.05] + for index in range(len(epsilon)): + e = epsilon[index] + s = step[index] + r_script = Path(__file__).parent.resolve() / "ZTNB_tryCatch.R" + r_args = [ + "Rscript", + r_script, + cluster_filename, + str(threshold), + str(e), + str(s), + ] + gc.collect() + p = subprocess.Popen(r_args) + stdout_value = p.communicate()[0] + try: + r_output_log = open(cluster_filename + ".pipeclip.ztnblog", "r") + # LOGGER.debug("Log file opened") + flag = r_output_log.read(1) + if flag == "Y": # converged + break + elif flag == "N": + continue + except: + LOGGER.info( + "No log file was produced by R code, continue regression using other parameters anyway." + ) + continue + + # check ztnb file + try: + enrich_parameter = open(cluster_filename + ".pipeclip.ztnb", "r") + except IOError as message: + LOGGER.error("Cannot open ztnb result file") + return False + nbDic = {} + for item in enrich_parameter: + buf = item.rstrip().split("\t") + if buf[0] != "#": + nb_key = "_".join(buf[0:2]) # reads_length as key + # LOGGER.debug("NB_key %s" % nb_key) + if nb_key not in nbDic: + nbDic[nb_key] = (buf[2], buf[3]) # pvalue and qvalue + # LOGGER.info("There are %d read-length pairs" % (len(nbDic.keys()))) + if len(list(nbDic.keys())) == 0: + LOGGER.error("There are no read-length pairs found by ZTNB. 
Exit.") + return False + else: + for i in range(len(clip.clusters)): + r_key = ( + str(clip.clusters[i].score) + + "_" + + str(clip.clusters[i].stop - clip.clusters[i].start) + ) + # LOGGER.debug("Keys from clip.clusters,%s" % r_key) + if r_key in nbDic: + clip.clusters[i].pvalue = nbDic[r_key][0] + clip.clusters[i].qvalue = nbDic[r_key][1] + clip.clusters[i].sig = True + clip.sigClusterCount += 1 + nbDic = None + try: + os.remove(cluster_filename) + except: + pass + if clip.sigClusterCount == 0: + return False + else: + return True + + +def clusterEnrich_outsource(clip, threshold=0.01): + cluster_filename = clip.outprefix + ".clusters.bed" + # Call R code and get result + epsilon = [0.01, 0.15, 0.1] + step = [0.1, 0.08, 0.05] + for e in epsilon: + for s in step: + r_script = Path(__file__).parent.resolve() / "ZTNB_tryCatch.R" + r_args = [ + "Rscript", + r_script, + cluster_filename, + str(threshold), + str(e), + str(s), + ] + p = subprocess.Popen(r_args) + stdout_value = p.communicate()[0] + + # check ztnb file + try: + enrich_parameter = open(cluster_filename + ".pipeclip.ztnb", "r") + except IOError: + LOGGER.error("Cannot open ztnb result file") + return False + nbDic = {} + for item in enrich_parameter: + buf = item.rstrip().split("\t") + if buf[0] != "#": + nb_key = "_".join(buf[0:2]) # reads_length as key + # LOGGER.debug("NB_key %s" % nb_key) + if nb_key not in nbDic: + nbDic[nb_key] = (buf[2], buf[3]) # pvalue and qvalue + # LOGGER.info("There are %d read-length pairs" % (len(nbDic.keys()))) + if len(list(nbDic.keys())) == 0: + LOGGER.error("There are no read-length pairs found by ZTNB. Exit.") + return False + else: + for i in range(len(clip.clusters)): + r_key = ( + str(clip.clusters[i].score) + + "_" + + str(clip.clusters[i].stop - clip.clusters[i].start) + ) + # LOGGER.debug("Keys from clip.clusters,%s" % r_key) + if r_key in nbDic: + clip.clusters[i].pvalue = nbDic[r_key][0] + clip.clusters[i].qvalue = nbDic[r_key][1] + clip.clusters[i].sig = True + clip.sigClusterCount += 1 + nbDic = None + try: + os.remove(cluster_filename) + except: + pass + if clip.sigClusterCount == 0: + return False + else: + return True + + +def fisherTest(clusterp, mutationp): + R = robject.r + min_mp = min(mutationp) + product = clusterp * min_mp + if product == 0: + fps = 0 + else: + xsq = -2 * math.log(clusterp * min_mp) + fp = R.pchisq(xsq, **{"df": 4, "lower.tail": False, "log.p": True})[0] + fps = math.exp(fp) + return fps diff --git a/pipeclip/Utils.py b/pipeclip/Utils.py new file mode 100644 index 0000000..754e446 --- /dev/null +++ b/pipeclip/Utils.py @@ -0,0 +1,152 @@ +#!/usr/bin/python +# programmer: beibei.chen@utsouthwestern.edu +# Usage: definition of utilities to handel pysam classes + +import logging +import subprocess +import sys + +from . 
diff --git a/pipeclip/Utils.py b/pipeclip/Utils.py
new file mode 100644
index 0000000..754e446
--- /dev/null
+++ b/pipeclip/Utils.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+# programmer: beibei.chen@utsouthwestern.edu
+# Usage: definition of utilities to handle pysam classes
+
+import logging
+import subprocess
+import sys
+
+from . import Alignment
+
+try:
+    assert sys.version_info > (3, 6)
+except AssertionError:
+    raise RuntimeError("Requires Python 3.6+!")
+
+
+def get_logger(name: str) -> logging.Logger:
+    """global logging."""
+    logger: logging.Logger = logging.getLogger(name)
+    if not logger.handlers:
+        handler: logging.StreamHandler = logging.StreamHandler()
+        formatter: logging.Formatter = logging.Formatter(
+            "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
+        )
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        # logger.setLevel(logging.DEBUG)
+        logger.setLevel(logging.INFO)
+    return logger
+
+
+LOGGER: logging.Logger = get_logger(__name__)
+
+
+def is_sorted(header):
+    """Take a BAM header and check whether this file is sorted.
+    Return: True: sorted, False: unsorted
+    The header variable is a string of header lines.
+    """
+    for row in header.split("\n"):
+        buf = row.rstrip().split("\t")
+        if buf[0] == "@HD":
+            if buf[2] in ["SO:unsorted", "SO:unknown"]:
+                return False
+            elif buf[2] in ["SO:coordinate", "SO:queryname"]:
+                return True
+    # If no HD header contains SO info, return False
+    return False
+
+
+def readQuaFilter(read, mlen, mis):
+    matchlen = 0
+    mismatch = 0
+    # LOGGER.debug("read %s,cigar %s,tags %s" % (read.qname,read.cigar,read.tags))
+    try:
+        for i in read.cigar:
+            # LOGGER.debug(i)
+            if i[0] == 0:
+                matchlen += i[1]
+            elif i[0] == 1:
+                mismatch += i[1]
+        for j in read.tags:
+            if j[0] == "NM":
+                mismatch += j[1]
+                break
+        # LOGGER.debug("matchlen %d,mismatch %d" % (matchlen,mismatch))
+    except Exception:
+        # LOGGER.debug("Problematic read %s" % (read))
+        pass
+    if matchlen >= mlen and mismatch <= mis:
+        return (True, matchlen, mismatch)
+    else:
+        return (False, 0, 0)
+
+
+def rmdupKey_Start(read):
+    # group duplicates by reference, alignment start and strand
+    k = str(read.tid) + ":" + str(read.pos) + ":" + str(read.is_reverse)
+    return k
+
+
+def rmdupKey_Seq(read):
+    # group duplicates by position, strand and the read sequence itself
+    k = (
+        str(read.tid)
+        + ":"
+        + str(read.pos)
+        + ":"
+        + str(read.is_reverse)
+        + ":"
+        + read.seq
+    )
+    return k
+
+
+def filterMutations(mlist, feature, keep=True):
+    # keep=True retains mutations matching the feature; keep=False drops them
+    newList = []
+    for i in mlist:
+        if i.type == feature:
+            if keep:
+                newList.append(i)
+        else:
+            if not keep:
+                newList.append(i)
+    return newList
+
+
+def annotation(fileaddr, species):
+    """Call Homer annotation script."""
+    outfile = open(fileaddr + ".anno", "w")
+    args = ["annotatePeaks.pl", fileaddr, species]
+    p = subprocess.Popen(args, stdout=outfile)
+    stdout_value = p.communicate()[0]
+    outfile.close()
+
+
+def makeWig(bamfile):
+    wig = {}
+    for r in bamfile.references:
+        wig[r] = {}
+        for pileupColumn in bamfile.pileup(r):
+            wig[r][str(pileupColumn.pos)] = pileupColumn.n
+    return wig
+
+
+def makeWigByChr(bamfile, chr):
+    wig = {}
+    for pileupColumn in bamfile.pileup(chr):
+        wig[pileupColumn.pos] = pileupColumn.n
+    return wig
+
+
+def makeWigListByChr(bamfile, chr):
+    wig = Alignment.wiglist()
+    for pileupColumn in bamfile.pileup(chr):
+        wig.update(pileupColumn.pos, pileupColumn.n)
+    return wig
+
+
+def makeWigListByChr_array(bamfile, chr, chrlen):
+    wig = []
+    for _ in range(chrlen):
+        wig.append(0)
+    for pileupColumn in bamfile.pileup(chr):
+        wig[pileupColumn.pos] = pileupColumn.n
+    return wig
+
+
+def bisort(alist, b):
+    """List is a list of bed instances, b is also an instance."""
+    if isinstance(alist[0], Alignment.BED) and isinstance(b, Alignment.BED):
+        pass
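The `makeWig*` helpers above turn pysam pileups into per-position coverage maps, which is where `KMvalue_test` later reads the K value from. A minimal sketch of the same idea, assuming a coordinate-sorted and indexed BAM (the file name is hypothetical; `.pos` and `.n` are the legacy pysam aliases used throughout this codebase, spelled `reference_pos` and `nsegments` in newer pysam):

```python
import pysam

# hypothetical input; any coordinate-sorted, indexed BAM works
bam = pysam.AlignmentFile("example.sorted.bam", "rb")

coverage = {}  # position -> number of reads piled up there
for col in bam.pileup("chr1"):
    coverage[col.pos] = col.n
bam.close()

# K for a mutation at position p on chr1 is then coverage.get(p, 0);
# M is the mutant-read count carried in the mutation BED's score column.
print(coverage.get(10000, 0))
```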
diff --git a/pipeclip/ZTNB_tryCatch.R b/pipeclip/ZTNB_tryCatch.R
new file mode 100644
index 0000000..70f629c
--- /dev/null
+++ b/pipeclip/ZTNB_tryCatch.R
@@ -0,0 +1,188 @@
+#!/usr/bin/env Rscript
+# -*- coding: utf-8 -*-
+
+# Requires at least R 3.0 and package VGAM plus dependencies
+# Programmers: jonghyun.yun@utsouthwestern.edu and beibei.chen@utsouthwestern.edu
+# Usage: fit all the read-count and length combinations of enriched clusters
+# Input: BED file with the score column holding the read count
+# Output: table with read count, length, p value and FDR
+# Last modified: 20 Sep 2014
+
+# command line parameters:
+# 1: input file name; two output files, input.ztnb and input.ztnblog, are created
+# 2: FDR cutoff
+# 3: regression epsilon
+# 4: regression step
+
+required_packages <- c("MASS", "VGAM")
+for (p in required_packages) {
+  if (!require(p, character.only = TRUE)) {
+    install.packages(p, repos = "https://cloud.r-project.org")
+    suppressMessages(require(p, character.only = TRUE))
+  }
+}
+
+args <- commandArgs(TRUE)
+data <- read.table(args[1], sep = "\t")
+len <- data[, 3] - data[, 2]
+read <- data[, 5]
+cut <- as.numeric(as.character(args[2]))
+vglm_epsilon <- as.numeric(as.character(args[3]))
+vglm_step <- as.numeric(as.character(args[4]))
+# tabulate the data
+wts <- table(read, len)
+temp_coln <- as.integer(names(wts[1, ]))
+temp_rown <- as.integer(names(wts[, 1]))
+drow <- length(temp_rown)
+dcol <- length(temp_coln)
+rown <- matrix(rep(temp_rown, dcol), drow, dcol)
+coln <- t(matrix(rep(temp_coln, drow), dcol, drow))
+wdx <- which(wts > 0)
+tread <- rown[wdx]
+tlen <- coln[wdx]
+twts <- wts[wdx]
+# local polynomial regression: read ~ len
+# smoothing parameter:
+alp <- 0.95
+# polynomial degree:
+pd <- 1
+lregfit <- loess(
+  tread ~ tlen,
+  weights = twts,
+  span = alp,
+  family = "symmetric",
+  degree = pd
+)
+mu <- lregfit$fitted
+mu[mu <= 0] <- min(exp(-4), mu[mu > 0])
+# bounded mean function
+logmu <- log(mu)
+# compute p-values using N(0,1) for (tread[-cdx] - mu[-cdx]) / sqrt(mu[-cdx])
+nb.pvalue <- numeric(length(tread))
+zscore <- (tread - mu) / sqrt(mu)
+cdx <- which(zscore < 4)
+nb.pvalue[-cdx] <- pnorm(zscore[-cdx], lower.tail = FALSE)
+tread_fit <- tread[cdx]
+tlen_fit <- tlen[cdx]
+twts_fit <- twts[cdx]
+lregfit <- loess(
+  tread_fit ~ tlen_fit,
+  weights = twts_fit,
+  span = alp,
+  family = "symmetric",
+  degree = pd
+)
+mu <- lregfit$fitted
+mu[mu <= 0] <- min(exp(-4), mu[mu > 0])
+# bounded mean function
+logmu <- log(mu)
+# negative binomial regression with the known predictor log(mu)
+
+intercept1 <- seq(-1, 1, 0.5)
+intercept2 <- seq(-1, 1, 0.5)
+biggest_likelihood <- -99999999
+khat <- 0
+muhat <- 0
+converge_flag <- FALSE
+errmsg <- paste("Error/warning messages for run using Epsilon:", vglm_epsilon, "and step size:", vglm_step, ".")
+options(warn = 1)
+# use the default settings first to see if the fit converges
+nb <- tryCatch(
+  {
+    vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step, offset = logmu, epsilon = vglm_epsilon, silent = FALSE)
+  },
+  warning = function(w) {
+    warnmsg <- strsplit(toString(w["message"]), " iteration")[[1]][1]
+    if (grepl("convergence not obtained", warnmsg)) {
+      newmsg <- paste("Convergence not obtained in 200 iterations")
+      return(newmsg)
+    }
+  },
+  error = function(e) {
+    newmsg <- paste("Program failed to converge.")
+    return(newmsg)
+  },
+  finally = {
+  }
+) # end of tryCatch
+if (length(nb) > 0) {
+  if (typeof(nb) == "S4") {
+    # converged: got a regression object
+    if (logLik(nb) >= biggest_likelihood & Coef(nb)["size"] <= 100) {
+      biggest_likelihood <- logLik(nb)
+      khat <- Coef(nb)["size"]
+      muhat <- Coef(nb)["munb"]
+    }
+  } else if (typeof(nb) == "character") {
+    errmsg <- paste(errmsg, nb, sep = "\n")
+  }
+}
+
+
+if ((biggest_likelihood == -99999999) && (khat == 0) && (muhat == 0)) { # try different coefficient starts
+  for (i in intercept1)
+  {
+    for (j in intercept2)
+    {
+      nb <- tryCatch(
+        {
+          vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step, offset = logmu, epsilon = vglm_epsilon, silent = FALSE, coefstart = c(i, j))
+        },
+        warning = function(w) {
+          warnmsg <- strsplit(toString(w["message"]), " iteration")[[1]][1]
+          if (grepl("convergence not obtained", warnmsg)) {
+            newmsg <- paste("Using coefstart c(", i, ",", j, "), convergence not obtained in 200 iterations")
+            return(newmsg)
+          }
+        },
+        error = function(e) {
+          newmsg <- paste("Using coefstart c(", i, ",", j, "), program failed to converge.")
+          return(newmsg)
+        },
+        finally = {
+        }
+      ) # end of tryCatch
+      if (length(nb) > 0) {
+        if (typeof(nb) == "S4") {
+          if (logLik(nb) >= biggest_likelihood & Coef(nb)["size"] <= 100) {
+            biggest_likelihood <- logLik(nb)
+            khat <- Coef(nb)["size"]
+            muhat <- Coef(nb)["munb"]
+          }
+        } else if (typeof(nb) == "character") {
+          errmsg <- paste(errmsg, nb, sep = "\n")
+        }
+      }
+    } # end of loop intercept2
+  } # end of loop intercept1
+} # end of trying different coefstart values
+
+if ((biggest_likelihood == -99999999) && (khat == 0) && (muhat == 0)) {
+  # no model converged; exit
+  errmsg <- paste("N:No model converged for this run.", errmsg, sep = "\n")
+} else {
+  # model parameters
+  errmsg <- paste("Y:Converged for this run.", errmsg, sep = "\n")
+  write(errmsg, paste(args[1], ".Converge.txt", sep = ""))
+  nbsize <- khat * mu
+  nbmu <- muhat * mu
+  nb.pvalue[cdx] <- pnbinom(
+    tread_fit - 1,
+    size = nbsize,
+    mu = nbmu,
+    lower.tail = FALSE
+  ) / (1 - dnbinom(0, size = nbsize, mu = nbmu))
+  nb.fdr <- p.adjust(nb.pvalue, method = "BH")
+  # output
+  nbdx <- nb.fdr <= cut
+  # sum(twts[nbdx])  # the number of clusters whose FDR <= cut
+  # read counts and cluster lengths of clusters whose FDR <= cut
+  nb.out <- as.data.frame(matrix(c(tread[nbdx], tlen[nbdx], nb.pvalue[nbdx], nb.fdr[nbdx]), sum(nbdx), 4))
+  names(nb.out) <- c("read", "length", "p", "fdr")
+  outname <- paste(args[1], ".pipeclip.ztnb", sep = "")
+  write.table(nb.out, outname, sep = "\t", quote = F, row.names = F)
+} # end of output
+write(errmsg, paste(args[1], ".pipeclip.ztnblog", sep = ""))
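+
+# Illustrative numbers for the zero-truncated tail probability used above
+# (the size/mu values here are hypothetical, not part of the pipeline):
+#   pnbinom(10 - 1, size = 2, mu = 3, lower.tail = FALSE) /
+#     (1 - dnbinom(0, size = 2, mu = 3))
+# i.e. the probability that a cluster with fitted mean mu collects 10 or
+# more reads, conditional on the cluster containing at least one read.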
diff --git a/pipeclip/__init__.py b/pipeclip/__init__.py
new file mode 100644
index 0000000..2175132
--- /dev/null
+++ b/pipeclip/__init__.py
@@ -0,0 +1,2 @@
+# indicate this folder contains packages
+from . import CLIP, Alignment, Enrich, Utils, mutation
diff --git a/pipeclip/barcode_removal.py b/pipeclip/barcode_removal.py
new file mode 100644
index 0000000..d8de64f
--- /dev/null
+++ b/pipeclip/barcode_removal.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# Programmer : beibei.chen@utsouthwestern.edu
+# Usage: remove PCR duplicates by comparing sequences; returns a FASTQ file with the barcode removed
+# Input: fastq file
+# Output: fastq file
+# Last modified: 28 June 2014 - Beibei Chen
+
+import sys
+
+
+class fastq:
+    """One FASTQ record: four raw lines (id, sequence, '+' line, quality)."""
+
+    def __init__(self, x):
+        self.id = x[0]
+        self.seq = x[1]
+        self.name = x[2]
+        self.quality = x[3]
+
+    def __str__(self):
+        st = self.id + self.seq + self.name + self.quality
+        return st
+
+
+class fqList:
+    def __init__(self):
+        self.data = []
+        self.unique = {}
+
+    def readFq(self, fh, bl):
+        """Read records four lines at a time; keep one record per unique sequence."""
+        buf = fh.readlines()
+        for i in range(0, len(buf) - 3, 4):
+            if buf[i][0] == "@" and buf[i + 2][0] == "+":
+                st = [buf[i], buf[i + 1], buf[i + 2], buf[i + 3].rstrip()]
+                self.addFq(st)
+                if buf[i + 1] not in self.unique:
+                    self.addUniqFq(st, bl)
+
+    def addFq(self, s):
+        newFq = fastq(s)
+        self.data.append(newFq)
+
+    def addUniqFq(self, s, b):
+        # clip the first b bases (the barcode) from sequence and quality
+        seq_key = s[1]
+        offset = b
+        newFq = fastq([s[0], s[1][offset:], s[2], s[3][offset:]])
+        self.unique[seq_key] = newFq
+
+
+class barcodeRemover:
+    def __init__(self, infile, barLen):
+        self.infile = infile
+        self.barLen = barLen
+
+    def run(self):
+        myfq = fqList()
+        myfq.readFq(self.infile, self.barLen)
+        for item in list(myfq.unique.values()):
+            if len(item.seq) >= 15:  # drop reads too short after clipping
+                print(item)
+
+
+def barCodeRemovalMain(infilePath, barLenStr):
+    try:
+        infile = open(infilePath, "r+")
+    except IOError as message:
+        print("cannot open file", message, file=sys.stderr)
+        sys.exit(1)
+    try:
+        barLen = int(barLenStr)
+    except (TypeError, ValueError):
+        barLen = 5
+    remover = barcodeRemover(infile, barLen)
+    remover.run()
+
+
+if __name__ == "__main__":
+    barCodeRemovalMain(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else "5")
diff --git a/pipeclip/main.py b/pipeclip/main.py
new file mode 100644
index 0000000..dcba40a
--- /dev/null
+++ b/pipeclip/main.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2022 Ye Chang yech1990@gmail.com
+# Distributed under terms of the GNU license.
+#
+# Created: 2022-04-26 18:14
+
+"""Fork from:
+
+- Main pipeline connecting all the scripts together
+- Original Programmer: beibei.chen@utsouthwestern.edu
+- Refactored by: eric.roos@utsouthwestern.edu
+- Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species
+- Required packages: pysam, pybedtools, rpy2
+"""
+
+
+import pathlib
+import sys
+
+import click
+
+from . import CLIP, Enrich, Utils
+
+LOGGER = Utils.get_logger(__name__)
+
+
+@click.command(help="PIPE-CLIP v2.0.10", no_args_is_help=True)
+@click.option(
+    "--infile", "-i", "infile", help="Input bam file.", required=True
+)
+@click.option(
+    "--control", "-t", "control", help="Control bam file.", required=False
+)
+@click.option(
+    "--output",
+    "-o",
+    "output_prefix",
+    help="Output prefix.",
+    required=True,
+)
+@click.option(
+    "--matchLength",
+    "-l",
+    "matchLength",
+    help="Minimum matched segment length.",
+    type=int,
+    required=True,
+)
+@click.option(
+    "--mismatch",
+    "-m",
+    "mismatch",
+    help="Maximum mismatch number.",
+    type=int,
+    required=True,
+)
+@click.option(
+    "--clipType",
+    "-c",
+    "clipType",
+    help="CLIP type: (0) HITS-CLIP; (1) PAR-4SU; (2) PAR-6SG; (3) iCLIP",
+    type=click.IntRange(0, 3),
+    required=True,
+)
+@click.option(
+    "--rmdup",
+    "-r",
+    "rmdup",
+    help="Remove PCR duplicates: (0) no removal; (1) by read start; (2) by sequence",
+    type=click.IntRange(0, 2),
+    required=True,
+)
+@click.option(
+    "--fdrMutation",
+    "-M",
+    "fdrReliableMutation",
+    help="FDR for reliable mutations.",
+    type=float,
+    required=True,
+)
+@click.option(
+    "--fdrCluster",
+    "-C",
+    "fdrEnrichedCluster",
+    help="FDR for enriched clusters.",
+    type=float,
+    required=True,
+)
+@click.option(
+    "--species",
+    "-s",
+    "species",
+    help="Species (hg19, mm9, mm10...)",
+    type=click.Choice(["mm9", "mm10", "hg19"]),
+    required=False,
+)
+def runPipeClip(
+    infile,
+    control,
+    output_prefix,
+    matchLength,
+    mismatch,
+    rmdup,
+    fdrEnrichedCluster,
+    clipType,
+    fdrReliableMutation,
+    species,
+):
+    # make the output directory
+    pathlib.Path(output_prefix).parent.mkdir(parents=True, exist_ok=True)
+    # run the CLIP analysis
+    myClip = CLIP.CLIP(infile, output_prefix)
+    controlFlag = False
+    if control is not None:
+        controlClip = CLIP.CLIP(control, output_prefix + "Control")
+    LOGGER.info("Starting run")
+    if myClip.testInput():  # check input
+        LOGGER.info("Input file OK, starting PIPE-CLIP")
+        LOGGER.info("Species info %s" % species)
+        if control is not None:  # test the control file
+            if controlClip.testInput():
+                LOGGER.info(
+                    "Control file OK. Using control in mutation enrichment."
+                )
+                controlFlag = True
+            else:
+                LOGGER.info(
+                    "Control file format error. Continuing without control."
+                )
+        if myClip.readfile():
+            myClip.filter2(matchLength, mismatch, clipType, rmdup)
+            if controlFlag:
+                LOGGER.info("Reading in control file")
+                controlClip.readfile()
+                controlClip.filter2(matchLength, mismatch, clipType, rmdup)
+            if myClip.clusterCount > 0:
+                LOGGER.info("Getting enriched clusters")
+                status = Enrich.clusterEnrich_outsource(
+                    myClip, fdrEnrichedCluster
+                )
+                if status:
+                    LOGGER.info(
+                        "Found %d enriched clusters" % myClip.sigClusterCount
+                    )
+                    myClip.printEnrichedClusters()
+                else:
+                    LOGGER.error(
+                        "No enriched clusters found. Exiting program."
+                    )
+                    sys.exit(1)
+            else:
+                LOGGER.error(
+                    "No clusters found. Please check the input. Exiting program."
+                )
+                sys.exit(1)
+
+            if myClip.mutationCount > 0:
+                LOGGER.info("Getting reliable mutations")
+                if controlFlag:  # use control
+                    Enrich.mutationEnrichWCtrl(
+                        myClip, controlClip, fdrReliableMutation
+                    )
+                else:
+                    Enrich.mutationEnrich(myClip, fdrReliableMutation)
+                LOGGER.info(
+                    "There are %d reliable mutations" % myClip.sigMutationCount
+                )
+                myClip.printReliableMutations()
+            else:
+                LOGGER.warning("No mutations found in this BAM file.")
+            # start to get crosslinking sites
+            if myClip.sigClusterCount > 0 and myClip.sigMutationCount > 0:
+                LOGGER.info("Getting cross-linking sites")
+                myClip.getCrosslinking()
+                if len(list(myClip.crosslinking.keys())) > 0:
+                    outfilelist = myClip.printCrosslinkingSites()
+                    myClip.printCrosslinkingMutations()
+                else:
+                    LOGGER.warning(
+                        "No crosslinking sites found, possibly because no reliable mutations fall inside enriched clusters. Printing enriched clusters instead."
+                    )
+                    outfilelist = myClip.printEnrichedClusters()
+            else:
+                if myClip.sigClusterCount <= 0:
+                    LOGGER.error(
+                        "There are no enriched clusters for this sample; please check your input file. Exiting."
+                    )
+                    sys.exit(2)
+                elif myClip.sigMutationCount <= 0:
+                    LOGGER.warning(
+                        "No reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates."
+                    )
+                    outfilelist = myClip.printEnrichedClusters()
+            # annotate if possible
+            if species in ["mm10", "mm9", "hg19"]:
+                LOGGER.info(
+                    "Starting to annotate crosslinking sites using HOMER"
+                )
+                for name in outfilelist:
+                    Utils.annotation(name, species)
+            # output a status log file
+            logfile = open(output_prefix + ".pipeclip.summary.log", "w")
+            print("PIPE-CLIP run finished. Parameters are:", file=logfile)
+            print(
+                "Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f"
+                % (
+                    infile,
+                    output_prefix,
+                    matchLength,
+                    mismatch,
+                    rmdup,
+                    fdrEnrichedCluster,
+                    fdrReliableMutation,
+                ),
+                file=logfile,
+            )
+            print(
+                "There are %d mapped reads in the input BAM file. After filtering, %d reads remain"
+                % (myClip.originalMapped, myClip.filteredAlignment),
+                file=logfile,
+            )
+            print(
+                "%d out of %d clusters are enriched."
+                % (myClip.sigClusterCount, len(myClip.clusters)),
+                file=logfile,
+            )
+            print(
+                "%d out of %d mutations are reliable."
+                % (myClip.sigMutationCount, myClip.mutationCount),
+                file=logfile,
+            )
+            print(
+                "%d crosslinking site candidates are found, with %d supporting reliable mutations."
+                % (
+                    len(list(myClip.crosslinking.keys())),
+                    len(myClip.crosslinkingMutations),
+                ),
+                file=logfile,
+            )
+            logfile.close()
+            LOGGER.info(
+                "PIPE-CLIP finished the job, please check your results. :)"
+            )
+    else:
+        LOGGER.error("Input file corrupted; exiting.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    runPipeClip()
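+
+
+# Example invocation of the command defined above (file names and cutoff
+# values are placeholders, not recommendations):
+#   pipeclip -i sample.bam -o results/sample -l 13 -m 1 -c 0 -r 1 \
+#            -M 0.01 -C 0.01 -s hg19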
diff --git a/pipeclip/mutation.py b/pipeclip/mutation.py
new file mode 100644
index 0000000..1cdbda8
--- /dev/null
+++ b/pipeclip/mutation.py
@@ -0,0 +1,347 @@
+#!/usr/bin/python
+# programmer : bbc
+# usage:
+# output in BED format; the score column (formerly the offset of the mutation
+# from the 5' end) now holds the per-read mutation count, which is 1
+
+import copy
+
+from . import Alignment, Utils
+
+LOGGER = Utils.get_logger(__name__)
+
+
+def countMatchNumber(b):
+    """Sum of M (match) segment lengths in a CIGAR list."""
+    m = 0
+    for i in b:
+        if i[0] == 0:
+            m += i[1]
+    return m
+
+
+def countInsertionNumber(b):
+    """Sum of I (insertion) segment lengths in a CIGAR list."""
+    m = 0
+    for i in b:
+        if i[0] == 1:
+            m += i[1]
+    return m
+
+
+def countDeletionNumber(b):
+    """Sum of D (deletion) segment lengths in a CIGAR list."""
+    m = 0
+    for i in b:
+        if i[0] == 2:
+            m += i[1]
+    return m
+
+
+def countMismatch(b):
+    """NM (edit distance) tag value, or 0 if absent."""
+    for i in b:
+        if i[0] == "NM":
+            return i[1]
+    return 0
+
+
+def survey(entry):
+    """Count insertions, deletions and substitutions for one alignment."""
+    mismatchNumber = countMismatch(entry.tags)
+    insertionNumber = countInsertionNumber(entry.cigar)
+    deletionNumber = countDeletionNumber(entry.cigar)
+    substitutionNumber = max(0, mismatchNumber - deletionNumber)
+    return [insertionNumber, deletionNumber, substitutionNumber]
+
+
+def SBeforeFirstM(ci):
+    """Number of soft-clipped bases before the first M (match) in a CIGAR."""
+    for index in range(len(ci)):
+        if ci[index][0] == 0:
+            if index == 0:
+                return 0
+            else:
+                s = 0
+                for i in range(0, index):
+                    if ci[i][0] == 4:
+                        s += ci[i][1]
+                return s
+    return 0  # no M segment found
+
+
+def parseCIGAR(ci):  # calculate the matched-segment length list
+    matchSeg = []
+    preIndex = 0
+    for index in range(len(ci)):
+        if ci[index][0] == 0:  # match tag
+            preIndex = index
+            matchSeg.append(ci[index][1])
+            if index > 0:  # M is not the first tag
+                s = 0
+                for i in range(
+                    preIndex - 1, index
+                ):  # check all the tags before this M
+                    if ci[i][0] == 4:
+                        s += ci[i][1]
+                matchSeg[-1] += s
+            # search for insertions behind (before the next match)
+            for insertionIndex in range(index + 1, len(ci)):
+                inse = 0
+                if ci[insertionIndex][0] == 0:
+                    break
+                elif ci[insertionIndex][0] == 1:  # insertion exists
+                    inse += ci[insertionIndex][1]
+                matchSeg[-1] += inse
+    return matchSeg
+
+
+def parseMD(b):
+    """Tokenize the MD tag into match lengths, mismatch bases and '^' markers."""
+    buf = []
+    for j in b:
+        if j[0] == "MD":
+            md = copy.deepcopy(j[1])
+            num = (
+                md.replace("A", "^")
+                .replace("G", "^")
+                .replace("T", "^")
+                .replace("C", "^")
+                .replace("^^", "^")
+                .split("^")
+            )
+            for c in range(num.count("")):
+                num.remove("")
+            buf = [num[0]]
+
+            counter = 1
+            afterAlpha = 0
+            # walk through the MD string to record mutations and deletions
+            for i in j[1]:
+                if i.isalpha() or i == "^":
+                    buf.append(i)
+                    afterAlpha = 1
+                else:
+                    if afterAlpha and counter <= len(num) - 1:
+                        buf.append(num[counter])
+                        afterAlpha = 0
+                        counter += 1
+            break
+    return buf
+
+
+def insertionLocation(entry, num):  # SAM entry and total insertion count
+    insertionLoc = {}  # key: ref_offset; value: list of seq_offsets
+    ref_offset = 0
+    seq_offset = [0]
+    preInsertion = 0  # used to check whether this is the last insertion tag
+    for i in entry.cigar:
+        if i[0] == 0 or i[0] == 4:  # match or soft clip
+            ref_offset += i[1]
+            seq_offset[-1] += i[1]
+        elif i[0] == 2:  # deletion counts on the reference, not the read
+            ref_offset += i[1]
+        elif i[0] == 1:
+            # record this insertion tag and prepare for the next one
+            preInsertion += i[1]
+            for ins in range(1, i[1]):
+                newloc = seq_offset[-1] + 1
+                seq_offset.append(newloc)
+            st = []
+            for item in seq_offset:
+                st.append(item)
+            insertionLoc[ref_offset] = st
+            if preInsertion == num:  # this is the last insertion tag
+                return insertionLoc
+            else:  # insertions only count on the read sequence
+                seq_index = seq_offset[0] + i[1]
+
+
+def countInsertionBefore(seqLoc, insertLocList):
+    """Number of recorded insertion positions strictly before seqLoc."""
+    if len(insertLocList) == 0:
+        return 0
+    ins = 0
+    for i in insertLocList:
+        if i > seqLoc:
+            return ins
+        else:
+            ins += 1
+    return ins
+
+
+def mutationLocation(entry, insertLoc):
+    """Yield substitution and deletion records (0-based coordinates) for one read."""
+    match = entry
+    myList = match.tags
+    S_count = SBeforeFirstM(
+        match.cigar
+    )  # hard clipping cuts the seq, so it does not count
+    mis = countMismatch(myList)
+    mdlist = parseMD(myList)
+    mdtag = "".join(mdlist)
+    counter = 0
+    if not mdtag.isdigit():  # mismatches or deletions may exist
+        st_seq = 0
+        st_genome = 0
+        offset = 0
+        pre = ":"
+        set_insertion = 0  # recorded insertion numbers already counted
+        for ch in mdlist:
+            if ch.isdigit():
+                st_seq += int(ch)
+                st_genome += int(ch)
+                pre = ch
+            elif ch == "^":
+                st_genome += 1
+                pre = ch
+            elif ch.isalpha():
+                if (not pre == "^") and (pre.isdigit()):
+                    # substitution
+                    origin = ch
+                    index = st_seq + S_count + offset
+                    insertionBefore = countInsertionBefore(index, insertLoc)
+                    loc = st_genome + match.pos + offset  # 0-based
+                    index += insertionBefore - set_insertion
+                    set_insertion = insertionBefore
+
+                    try:
+                        mu = match.seq[index]
+                    except IndexError:
+                        mu = "N"
+                    if mu.upper() != "N":
+                        offset = index - S_count + 1
+                        st_seq = 0
+                        st_genome = 0
+                        if match.is_reverse:
+                            strand = "-"
+                            t = RC([origin, mu])
+                            origin = t[0]
+                            mu = t[1]
+                        else:
+                            strand = "+"
+                        mutation = [
+                            str(loc),
+                            str(loc + 1),
+                            match.qname,
+                            1,  # mutation count rather than offset
+                            strand,
+                            origin + "->" + mu,
+                        ]
+                        yield mutation
+                else:  # this is a deletion
+                    if pre in ["A", "G", "T", "C"]:
+                        st_genome += 1
+                    loc = st_genome + match.pos + offset - 1  # 0-based
+                    if match.is_reverse:
+                        strand = "-"
+                        ch = RC([ch])[0]
+                    else:
+                        strand = "+"
+                    index1 = loc - match.pos
+                    insertionBefore = countInsertionBefore(index1, insertLoc)
+                    index1 += insertionBefore - set_insertion
+                    set_insertion = insertionBefore
+                    mutation = [
+                        str(loc),
+                        str(loc + 1),
+                        match.qname,
+                        1,
+                        strand,
+                        "Deletion->" + ch,
+                    ]
+                    yield mutation
+                pre = ch
+
+    return
+
+
+def RC(strList):
+    """Reverse-complement every string in strList."""
+    rc = []
+    for item in strList:
+        st = ""
+        for chIndex in range(len(item)):
+            rcIndex = len(item) - 1 - chIndex  # walk from the 3' end
+            if item[rcIndex].upper() == "A":
+                st += "T"
+            elif item[rcIndex].upper() == "C":
+                st += "G"
+            elif item[rcIndex].upper() == "T":
+                st += "A"
+            elif item[rcIndex].upper() == "G":
+                st += "C"
+            else:
+                st += "N"
+        rc.append(st)
+    return rc
+
+
+def getMutations(infile, read):
+    """Collect insertion, deletion and substitution records for one read."""
+    mutationList = []
+
+    sur = survey(read)
+    insertion = sur[0]
+    deletion = sur[1]
+    substi = sur[2]
+    insertionSeqLoc = []
+    if insertion > 0:
+        insertionDic = insertionLocation(read, insertion)
+        for k in list(insertionDic.keys()):
+            for loc_index in range(len(insertionDic[k])):
+                insertionSeqLoc.append(insertionDic[k][loc_index])
+                mu = read.seq[insertionDic[k][loc_index]]
+                loc = k + loc_index + read.pos
+                if read.tid >= 0:
+                    chr = infile.getrname(read.tid)
+                    if read.is_reverse:
+                        strand = "-"
+                        mu = RC([mu])[0]
+                    else:
+                        strand = "+"
+                    mutationList.append(
+                        Alignment.MutationBed(
+                            chr,
+                            loc,
+                            loc + 1,
+                            read.qname,
+                            1,
+                            strand,
+                            "Insertion->" + mu,
+                        )
+                    )
+    insertionSeqLoc.sort()
+    if deletion + substi > 0:
+        for mu in mutationLocation(read, insertionSeqLoc):
+            if read.tid >= 0:
+                chr = infile.getrname(read.tid)
+                newMu = Alignment.MutationBed(
+                    chr, mu[0], mu[1], mu[2], mu[3], mu[4], mu[5]
+                )
+                mutationList.append(newMu)
+    return mutationList
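+
+
+# Worked example of the MD-tag bookkeeping above (SAM spec semantics):
+# MD:Z:10A5^AC6 encodes 10 matches, a mismatch where the reference base is A,
+# 5 matches, a deletion of reference bases AC, then 6 matches; parseMD()
+# tokenizes it as ['10', 'A', '5', '^', 'A', 'C', '6'].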
+
+
+def getTruncations(infile, read):
+    """A read has only one truncation site, but to stay consistent with
+    mutations, return it in a one-element list."""
+    if read.is_reverse:
+        strand = "-"
+        start = read.pos + read.alen  # 5' end of a reverse-strand read
+    else:
+        strand = "+"
+        start = read.pos
+    stop = start + 1
+    if read.tid < 0:  # unmapped read: no reference name available
+        return []
+    chr = infile.getrname(read.tid)
+    newTr = Alignment.MutationBed(
+        chr, start, stop, read.qname, 1, strand, "truncation"
+    )
+    return [newTr]
diff --git a/lib/runR1.sh b/pipeclip/runR1.sh
old mode 100755
new mode 100644
similarity index 60%
rename from lib/runR1.sh
rename to pipeclip/runR1.sh
index 7d5a115..be13df2
--- a/lib/runR1.sh
+++ b/pipeclip/runR1.sh
@@ -1,4 +1,5 @@
 #!/bin/sh
+
 filename=$1
 pvalue=$2
 #declare -a epsilon
@@ -35,21 +36,21 @@ steps=(0.1 0.08 0.05)
 #Call R function
 r_status="1"
 count=1
-for e in "${epsilon[@]}"
-do
-  for s in "${steps[@]}"
-  do
-    echo "$e,$s"
-    if [ -s "$filename.Converge.txt" ]
-    then
-      echo
-    else
-      #echo "$e,$s"
-      Rscript lib/ZTNB_tryCatch.R $filename $pvalue $e $s
-    fi
-    #echo "$filename.$count"
-    count=$((count+1))
-  done
-done
-
+SCRIPTPATH="$(
+  cd -- "$(dirname "$0")" >/dev/null 2>&1
+  pwd -P
+)"
+for e in "${epsilon[@]}"; do
+  for s in "${steps[@]}"; do
+    echo "$e,$s"
+    if [ -s "$filename.Converge.txt" ]; then
+      echo
+    else
+      #echo "$e,$s"
+      Rscript ${SCRIPTPATH}/ZTNB_tryCatch.R $filename $pvalue $e $s
+    fi
+    #echo "$filename.$count"
+    count=$((count + 1))
+  done
+done
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..de8f360
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,321 @@
+[[package]]
+name = "cffi"
+version = "1.15.0"
+description = "Foreign Function Interface for Python calling C code."
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pycparser = "*"
+
+[[package]]
+name = "click"
+version = "8.1.2"
+description = "Composable command line interface toolkit"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[[package]]
+name = "colorama"
+version = "0.4.4"
+description = "Cross-platform colored terminal text."
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+
+[[package]]
+name = "jinja2"
+version = "3.1.1"
+description = "A very fast and expressive template engine."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+MarkupSafe = ">=2.0"
+
+[package.extras]
+i18n = ["Babel (>=2.7)"]
+
+[[package]]
+name = "markupsafe"
+version = "2.1.1"
+description = "Safely add untrusted strings to HTML/XML markup."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[[package]]
+name = "pybedtools"
+version = "0.9.0"
+description = "Wrapper around BEDTools for bioinformatics work"
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pysam = "*"
+six = "*"
+
+[[package]]
+name = "pycparser"
+version = "2.21"
+description = "C parser in Python"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
+[[package]]
+name = "pysam"
+version = "0.19.0"
+description = "pysam"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "pytz"
+version = "2022.1"
+description = "World timezone definitions, modern and historical"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
+name = "pytz-deprecation-shim"
+version = "0.1.0.post0"
+description = "Shims to make deprecation of pytz easier"
+category = "main"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+
+[package.dependencies]
+tzdata = {version = "*", markers = "python_version >= \"3.6\""}
+
+[[package]]
+name = "rpy2"
+version = "3.5.1"
+description = "Python interface to the R language (embedded R)"
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+cffi = ">=1.10.0"
+jinja2 = "*"
+pytz = "*"
+tzlocal = "*"
+
+[package.extras]
+all = ["pytest", "setuptools", "numpy", "pandas"]
+numpy = ["pandas"]
+pandas = ["numpy", "pandas"]
+setup = ["setuptools"]
+test = ["pytest"]
+
+[[package]]
+name = "six"
+version = "1.16.0"
+description = "Python 2 and 3 compatibility utilities"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
+
+[[package]]
+name = "tzdata"
+version = "2022.1"
+description = "Provider of IANA time zone data"
+category = "main"
+optional = false
+python-versions = ">=2"
+
+[[package]]
+name = "tzlocal"
+version = "4.2"
+description = "tzinfo object for the local timezone"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+pytz-deprecation-shim = "*"
+tzdata = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+devenv = ["black", "pyroma", "pytest-cov", "zest.releaser"]
+test = ["pytest-mock (>=3.3)", "pytest (>=4.3)"]
+
+[metadata]
+lock-version = "1.1"
+python-versions = "^3.9"
+content-hash = "22b48ea4b7dda9219b9a96f39e66f62dcd11cff3132b9fa698d5c69445fa761e"
+
+[metadata.files]
+cffi = [
+    {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"},
+    {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"},
+    {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"},
+    {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"},
+    {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"},
+    {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"},
+    {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"},
+    {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"},
+    {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"},
+    {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"},
+    {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"},
+    {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"},
+    {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"},
+    {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"},
+    {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"},
+    {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"},
+    {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"},
+    {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"},
+    {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"},
+    {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"},
+    {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"},
+    {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"},
+    {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"},
+    {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"},
+    {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"},
+    {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"},
+    {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"},
+    {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"},
+    {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"},
+    {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"},
+    {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"},
+    {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"},
+    {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"},
+    {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"},
+    {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"},
+    {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"},
+    {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"},
+    {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"},
+    {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"},
+    {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"},
+    {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"},
+    {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"},
+    {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"},
+    {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"},
+    {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"},
+    {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"},
+    {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"},
+    {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"},
+    {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"},
+    {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"},
+]
+click = [
+    {file = "click-8.1.2-py3-none-any.whl", hash = "sha256:24e1a4a9ec5bf6299411369b208c1df2188d9eb8d916302fe6bf03faed227f1e"},
+    {file = "click-8.1.2.tar.gz", hash = "sha256:479707fe14d9ec9a0757618b7a100a0ae4c4e236fac5b7f80ca68028141a1a72"},
+]
+colorama = [
+    {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
+    {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
+]
+jinja2 = [
+    {file = "Jinja2-3.1.1-py3-none-any.whl", hash = "sha256:539835f51a74a69f41b848a9645dbdc35b4f20a3b601e2d9a7e22947b15ff119"},
+    {file = "Jinja2-3.1.1.tar.gz", hash = "sha256:640bed4bb501cbd17194b3cace1dc2126f5b619cf068a726b98192a0fde74ae9"},
+]
+markupsafe = [
+    {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"},
+    {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"},
+    {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"},
+    {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"},
+    {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"},
+    {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
+]
+pybedtools = [
+    {file = "pybedtools-0.9.0.tar.gz", hash = "sha256:9267c92cd764173449d9c31baedac0659b4eccc3d7c05e22ec378f86c0fc30a3"},
+]
+pycparser = [
+    {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"},
+    {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
+]
+pysam = [
+    {file = "pysam-0.19.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:44de1a3af7c7eb5f404d6337f0c9c4ee88c34c2d2fee1a7896ccd8e7d2aa475a"},
+    {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ceb07c6253598ec70fef6ac0c0f7ab0d299562c1a91e737adb07239afba22d6"},
+    {file = "pysam-0.19.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2057f3b8cc20562fd010e7971e83ab78978f17975563a711c94bca583ce8a2d3"},
+    {file = "pysam-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a1a9ee6cd6dfa50973dcb51cd2245ea7d4d64d4e962d60e5e1a088f7b790e3e"},
+    {file = "pysam-0.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b831170ff810bfd1242dbce4ddf8e693e95e39a4332d5903d416233d3d1be50a"},
+    {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa4a5d334986d0696522246820f295cbf6c18dc1b78798f800a2d57d56207789"},
+    {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ab58d7d9b140e2e8a4918fc00661aa901ba461d9bccd4b499102b0828f2d897e"},
+    {file = "pysam-0.19.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1d964c29fedf55d2a5d095227d19d915c2e0e5e42b1e0bdf7fab30cd1d2a850"},
+    {file = "pysam-0.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8c78f9b84f7fe69530eaccf5b7f08636b3419f0017b5050aa7013f1c59d3d654"},
+    {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ada92b376717ac4c9c9924a096af9186035115d29a113eaa997d1c020ce421b9"},
+    {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:80b8f5b929c7f660b6e1497790a13c61f386b310a31ca54ad6ba110674d11c55"},
+    {file = "pysam-0.19.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30017bed8d002d41f83fa7e10569525c811a8e3860d73a880ee653ef29fd4fbc"},
+    {file = "pysam-0.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5e2e465522ba2c34cb96c013a28f9c9db303f8ab1350b4c11cca73677f1e9594"},
+    {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2103b4b6d9b0cc0b4aaccf64e87a92bfdabb7dc92810cf84be35ffe78fafa002"},
+    {file = "pysam-0.19.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eaf342b9c71ed83a63237df2000e3bc1f0236165d48fd7240c4c78b66f28d63b"},
+    {file = "pysam-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc0021b154206edfbaa13515cb67523c76c576b7a670e72a793f2da4f03139f4"},
+    {file = "pysam-0.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3101e0fcd2f6accbfa72a554a71baf83f1c096bb0f9045059b3ead35901ce128"},
+    {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5118dcabac574638df43739941f0ee20cc4e9197aee4a8f10f195542a13f18e3"},
+    {file = "pysam-0.19.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4fe1f1fae0e559d3412625dc7d4d08b477e80211b3fe5b267ba341a84f78b8be"},
+    {file = "pysam-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2911e3760dd2b709c686f075146787b8bda35629093352c28a7e52ac00ddaffc"},
+    {file = "pysam-0.19.0.tar.gz", hash = "sha256:dcc052566f9509fd93b2a2664f094a13f016fd60cdd189e05fb4eafa0c89505b"},
+]
+pytz = [
+    {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
+    {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
+]
+pytz-deprecation-shim = [
+    {file = "pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl", hash = "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6"},
+    {file = "pytz_deprecation_shim-0.1.0.post0.tar.gz", hash = "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"},
+]
+rpy2 = [
+    {file = "rpy2-3.5.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:eac176e65610a475bb375292db7fb571066d758b52ad523fe7b8f120947cbcef"},
+    {file = "rpy2-3.5.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:d35e62a4948b5b620cf2b47f6d87dc84a1e9d18f163caa0b98824c228138e24f"},
+    {file = "rpy2-3.5.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:782d2d2a1a0ef2f4b58bd9e56adeab82594281eb23358d5f77a5af12d213375b"},
+    {file = "rpy2-3.5.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:6b79f1e772b9ecdd287d963656e9653f7aca4b7df434d425abcde2cbe2271d19"},
+    {file = "rpy2-3.5.1.tar.gz", hash = "sha256:d35717489bd0754b556202a6b990ebc6f3a1c18c9c23b3a862a46a91fc265805"},
+]
+six = [
+    {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
+    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
+]
+tzdata = [
+    {file = "tzdata-2022.1-py2.py3-none-any.whl", hash = "sha256:238e70234214138ed7b4e8a0fab0e5e13872edab3be586ab8198c407620e2ab9"},
+    {file = "tzdata-2022.1.tar.gz", hash = "sha256:8b536a8ec63dc0751342b3984193a3118f8fca2afe25752bb9b7fffd398552d3"},
+]
+tzlocal = [
+    {file = "tzlocal-4.2-py3-none-any.whl", hash = "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745"},
+    {file = "tzlocal-4.2.tar.gz", hash = "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"},
+]
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..35de657
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "pipeclip"
+version = "2.0.10"
+description = "PIPELINE FOR CLIP SEQ DATA"
+readme = "README.md"
+keywords = ["bioinformatics", "CLIP", "PAR-CLIP", "iCLIP", "RNA-protein interaction", "mutation", "truncation"]
+authors = ["Zhiqun Xie"]
+repository = "https://github.com/y9c/PIPE-CLIP"
+
+[tool.poetry.dependencies]
+python = "^3.6"
+pysam = "^0.19.0"
+pybedtools = "^0.9.0"
+click = "^8.1.2"
+rpy2 = "^3.5.1"
+colorama = "^0.4.4"
+
+[tool.poetry.dev-dependencies]
+
+[tool.poetry.scripts]
+pipeclip = 'pipeclip.main:runPipeClip'
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/test/resources/foo.txt b/test/resources/foo.txt
deleted file mode 100644
index 0527e6b..0000000
--- a/test/resources/foo.txt
+++ /dev/null
@@ -1 +0,0 @@
-This is a test
diff --git a/test/src/foo.txt b/test/src/foo.txt
deleted file mode 100644
index 190a180..0000000
--- a/test/src/foo.txt
+++ /dev/null
@@ -1 +0,0 @@
-123