diff --git a/bin/mid-cluster.py b/bin/mid-cluster.py index e27c4fe..11d9a97 100755 --- a/bin/mid-cluster.py +++ b/bin/mid-cluster.py @@ -60,7 +60,7 @@ help=( "The approach to use when making a consensus if there are no reads " "covering a site. A value of 'N' means to use an ambigous N nucleotide " - "code, whereas a value fo 'reference' means to take the base from the " + "code, whereas a value of 'reference' means to take the base from the " "reference sequence." ), ) @@ -71,6 +71,27 @@ list(chain.from_iterable(args.referenceId)) if args.referenceId else None ) + # The logic of the below is a little hard to follow. There are three steps: + # + # 1. Make a read analysis with a 'run' method (which we don't call yet). + # + # 2. Set up a Cluster Analysis, giving it the read analysis (so the running cluster + # analysis can get some parameters and produce reporting information when the + # read analysis is run). + # + # 3. Call the read analysis 'run' method (see 1 above), passing it the function from + # the cluster analysis that can analyze a reference. + # + # Things are done in this (seemingly?) convoluted way because I implemented several + # methods for doing the main work, of which the cluster analysis is just one. They + # all needed to work in the same way, and so I separated the organizational things + # (like making output directories and collecting final / overall results) into the + # common read analysis class, away from the code that does the actual analysis for an + # input alignment file / reference pair. If you look at the 'run' method of the + # ReadAnalysis class you'll see it's basically just a loop over alignment file / + # reference pairs, each with some setup and then calling the cluster analysis + # function. Then some final gathering of overall results. 
+ analysis = ReadAnalysis( args.sampleName, list(chain.from_iterable(args.alignmentFile)), diff --git a/src/midtools/analysis.py b/src/midtools/analysis.py index b390c9c..df0cf69 100644 --- a/src/midtools/analysis.py +++ b/src/midtools/analysis.py @@ -3,7 +3,7 @@ from pathlib import Path from itertools import chain from collections import defaultdict -from typing import Optional, Callable +from typing import Callable from dark.dna import compareDNAReads from dark.process import Executor @@ -71,7 +71,7 @@ def __init__( alignmentFiles: list[str], referenceGenomeFiles: list[str], outputDir: str, - referenceIds: Optional[list[str]] = None, + referenceIds: list[str] | None = None, minReads: int = DEFAULT_MIN_READS, homogeneousCutoff: float = DEFAULT_HOMOGENEOUS_CUTOFF, plotSAM: bool = False, diff --git a/src/midtools/clusterAnalysis.py b/src/midtools/clusterAnalysis.py index 0b7e822..9539793 100644 --- a/src/midtools/clusterAnalysis.py +++ b/src/midtools/clusterAnalysis.py @@ -13,7 +13,7 @@ from midtools.analysis import ReadAnalysis from midtools.clusters import ReadCluster, ReadClusters from midtools.match import matchToString -from midtools.offsets import analyzeOffets, findSignificantOffsets, OffsetBases +from midtools.offsets import analyzeOffsets, findSignificantOffsets, OffsetBases from midtools.plotting import plotBaseFrequencies, plotConsistentComponents from midtools.read import AlignedRead from midtools.reference import Reference @@ -349,9 +349,10 @@ def saveConsensuses( " Saving component %d consensus info to %s" % (count, consensusFilename, count, infoFilename) ) - with open(consensusFilename, "w") as consensusFp, open( - infoFilename, "w" - ) as infoFp: + with ( + open(consensusFilename, "w") as consensusFp, + open(infoFilename, "w") as infoFp, + ): # First write the reference sequence for this component. 
(reference,) = list( FastaReads(outputDir / ("reference-component-%d.fasta" % count)) @@ -481,7 +482,7 @@ def __init__( def analyzeReference(self, reference: Reference) -> None: """ - Analyze a reference. + Analyze a reference. This is called by the read analysis. @param reference: A C{Reference} instance to analyze. """ @@ -517,10 +518,9 @@ def findConnectedComponents(self, reference: Reference) -> list[Component]: Find all connected components. @param reference: A C{Reference} instance to analyze. - @return: A C{list} of C{Component} instances, - sorted by component (the smallest offset is used for sorting - so this gives the components from left to right along the - reference genome. + @return: A C{list} of C{Component} instances, sorted by component. The smallest + offset is used for sorting, which arranges the components from left to right + along the reference genome. """ significantReads = set( read for read in reference.alignedReads if read.significantOffsets @@ -696,7 +696,7 @@ def scoreCcs( return result def partitionCcs( - scoredCcs: list[tuple[float, int, ConsistentComponent]] + scoredCcs: list[tuple[float, int, ConsistentComponent]], ) -> tuple[ set[tuple[int, ConsistentComponent]], set[tuple[int, ConsistentComponent]] ]: @@ -809,8 +809,10 @@ def partitionCcs( # Get the base counts at each offset, from the full set of reads minus those # we're not using. 
- (wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = analyzeOffets( - referenceLength, set(reference.alignedReads) - unwantedReads + (wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = ( + analyzeOffsets( + referenceLength, set(reference.alignedReads) - unwantedReads + ) ) remainingOffsets = sorted(set(range(referenceLength)) - offsetsDone) diff --git a/src/midtools/connectedComponentAnalysis.py b/src/midtools/connectedComponentAnalysis.py index 79213c6..04272d8 100644 --- a/src/midtools/connectedComponentAnalysis.py +++ b/src/midtools/connectedComponentAnalysis.py @@ -9,7 +9,7 @@ from dark.sam import samfile from midtools.analysis import ReadAnalysis -from midtools.offsets import analyzeOffets, findSignificantOffsets +from midtools.offsets import analyzeOffsets, findSignificantOffsets from midtools.match import matchToString from midtools.plotting import plotBaseFrequencies, plotConsistentComponents from midtools.utils import ( @@ -107,8 +107,7 @@ def consensusSequence(self, componentOffsets, referenceSequence, infoFp): self.nucleotides[offset], referenceBase, infoFp, - "WARNING: consensus draw at offset %d" % offset - + " %(baseCounts)s.", + f"WARNING: consensus draw at offset {offset} %(baseCounts)s.", ) else: base = "-" @@ -323,9 +322,10 @@ def saveConsensuses(self, outputDir, count, referenceSequence, verbose): " Saving component %d consensus info to %s" % (count, consensusFilename, count, infoFilename) ) - with open(consensusFilename, "w") as consensusFp, open( - infoFilename, "w" - ) as infoFp: + with ( + open(consensusFilename, "w") as consensusFp, + open(infoFilename, "w") as infoFp, + ): # First write the reference sequence for this component. 
(reference,) = list( FastaReads(join(outputDir, "reference-component-%d.fasta" % count)) @@ -804,8 +804,8 @@ def bestConsistentComponent(component, reference, fp): bestCc.nucleotides[offset], referenceBase, infoFp, - (" WARNING: base count draw at offset %d " % offset) - + " %(baseCounts)s.", + f" WARNING: base count draw at offset {offset} " + "%(baseCounts)s.", ) consensus[offset] = base offsetsDone.add(offset) @@ -864,7 +864,7 @@ def bestConsistentComponent(component, reference, fp): consensusReadCountAtOffset, consensusWantedReadsBaseCountAtOffset, _, - ) = analyzeOffets(genomeLength, set(alignedReads) - unwantedCcReads) + ) = analyzeOffsets(genomeLength, set(alignedReads) - unwantedCcReads) depthFile = join(outputDir, "consensus-depth.txt") self.report(" Writing consensus depth information to", depthFile) @@ -891,11 +891,8 @@ def bestConsistentComponent(component, reference, fp): baseCount, referenceBase, infoFp, - ( - " WARNING: consensus base count draw at " - "offset %d" % offset - ) - + " %(baseCounts)s.", + f" WARNING: consensus base count draw at offset {offset} " + "%(baseCounts)s.", ) print( " Offset %d: %s from nucleotides %s" @@ -934,11 +931,8 @@ def bestConsistentComponent(component, reference, fp): baseCount, referenceBase, infoFp, - ( - " WARNING: consensus base count draw at " - "offset %d" % offset - ) - + " %(baseCounts)s.", + f" WARNING: consensus base count draw at offset {offset} " + "%(baseCounts)s.", ) print( " Offset %d: %s from nucleotides %s" @@ -1258,8 +1252,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp): bestCc.nucleotides[offset], referenceBase, infoFp, - (" WARNING: base count draw at offset %d " % offset) - + " %(baseCounts)s.", + f" WARNING: base count draw at offset {offset} " + "%(baseCounts)s.", ) if base == referenceBase: mismatch = "" @@ -1268,11 +1262,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp): baseCountAtOffset[offset], referenceBase, infoFp, - ( - " WARNING: consensus base count 
draw at " - "offset %d " % offset - ) - + " %(baseCounts)s.", + f" WARNING: consensus base count draw at offset {offset} " + "%(baseCounts)s.", ) mismatch = ( " (mismatch: reference has %s, all-read " @@ -1313,7 +1304,7 @@ def bestConsistentComponent(component, referenceCcIndex, fp): # aligned reads minus the reads we don't want because they're # in a consistent component that is not the best for this # non-reference sequence. - consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffets( + consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffsets( genomeLength, set(alignedReads) - unwantedCcReads ) @@ -1344,11 +1335,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp): baseCount, referenceBase, infoFp, - ( - " WARNING: consensus base count draw at " - "offset %d" % offset - ) - + " %(baseCounts)s.", + f" WARNING: consensus base count draw at offset {offset} " + "%(baseCounts)s.", ) print( " Offset %d: %s from nucleotides %s" @@ -1387,11 +1375,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp): baseCount, referenceBase, infoFp, - ( - " WARNING: consensus base count draw at " - "offset %d" % offset - ) - + " %(baseCounts)s.", + f" WARNING: consensus base count draw at offset {offset} " + "%(baseCounts)s.", ) print( " Offset %d: %s from nucleotides %s" diff --git a/src/midtools/offsets.py b/src/midtools/offsets.py index f4cff7a..120cd14 100644 --- a/src/midtools/offsets.py +++ b/src/midtools/offsets.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections import Counter -from concurrent.futures import ProcessPoolExecutor from typing import Iterator, Optional from midtools.read import AlignedRead @@ -161,48 +160,36 @@ def highestFrequenciesMultiple(a: OffsetBases, b: OffsetBases) -> Optional[float return orderedCounts[0][1] / orderedCounts[1][1] -def analyzeOffets( +def analyzeOffsets( genomeLength: int, alignedReads: list[AlignedRead] ) -> tuple[list[int], list[Counter[str]], 
list[set[AlignedRead]]]: """ Analyze the aligned reads. - @param genomeLength: The C{int} length of the genome the reads were - aligned to. + @param genomeLength: The C{int} length of the genome the reads were aligned to. @param alignedReads: A C{list} of C{AlignedRead} instances. @return: A tuple of C{list}s (readCountAtOffset, baseCountAtOffset, readsAtOffset), each indexed from zero to the genome length. """ - readCountAtOffset = dict() - baseCountAtOffset = dict() - readsAtOffset = dict() - - offsets = list(range(genomeLength)) - with ProcessPoolExecutor() as executor: - for offset, (reads, counts) in zip( - offsets, executor.map(processOffset, alignedReads, offsets) - ): - baseCountAtOffset[offset] = counts - readCountAtOffset[offset] = sum(counts.values()) - readsAtOffset[offset] = reads - - return ( - [x for _, x in sorted(readCountAtOffset.items())], - [x for _, x in sorted(baseCountAtOffset.items())], - [x for _, x in sorted(readsAtOffset.items())], - ) + readCountAtOffset = [] + baseCountAtOffset = [] + readsAtOffset = [] - -def processOffset(alignedReads, offset): nucleotides = set("ACGT") - reads = set() - counts: Counter[str] = Counter({n: 0 for n in nucleotides}) - for read in alignedReads: - base = read.base(offset) - if base in nucleotides: - counts[base] += 1 - reads.add(read) - return reads, counts + + for offset in range(genomeLength): + reads = set() + counts: Counter[str] = Counter() + for read in alignedReads: + base = read.base(offset) + if base in nucleotides: + counts[base] += 1 + reads.add(read) + baseCountAtOffset.append(counts) + readCountAtOffset.append(sum(counts.values())) + readsAtOffset.append(reads) + + return readCountAtOffset, baseCountAtOffset, readsAtOffset def findSignificantOffsets( diff --git a/src/midtools/options.py b/src/midtools/options.py index 12414c6..66fac95 100644 --- a/src/midtools/options.py +++ b/src/midtools/options.py @@ -5,7 +5,7 @@ from dark.sam import SAMFilter, PaddedSAM -from midtools.offsets import 
analyzeOffets, findSignificantOffsets +from midtools.offsets import analyzeOffsets, findSignificantOffsets from midtools.read import AlignedRead @@ -120,7 +120,7 @@ def parseCommandLineOptions( for alignedRead in executor.map(constructRead, queries): alignedReads.append(alignedRead) - readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffets( + readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffsets( genomeLength, alignedReads ) @@ -227,5 +227,5 @@ def addAnalysisCommandLineOptions(parser: ArgumentParser) -> None: "--verbose", type=int, default=0, - help=("The integer verbosity level (0 = no output, 1 = some output, etc)."), + help="The integer verbosity level (0 = no output, 1 = some output, etc).", ) diff --git a/src/midtools/read.py b/src/midtools/read.py index fcf4cf6..b34f56b 100644 --- a/src/midtools/read.py +++ b/src/midtools/read.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - from dark.reads import Read @@ -16,7 +14,7 @@ class AlignedRead(Read): """ def __init__( - self, id_: str, sequence: str, alignment: Optional[dict[str, str | bool]] = None + self, id_: str, sequence: str, alignment: dict[str, str | bool] | None = None ) -> None: self.significantOffsets: dict[int, str] = {} self._originalLength = len(sequence) @@ -119,7 +117,7 @@ def setSignificantOffsets(self, significantOffsets: list[int]) -> None: newSignificantOffsets[offset] = base self.significantOffsets = newSignificantOffsets - def base(self, n: int) -> Optional[str]: + def base(self, n: int) -> str | None: """ Get the nucleotide base at a given offset. 
diff --git a/src/midtools/reference.py b/src/midtools/reference.py index faabf32..accd449 100644 --- a/src/midtools/reference.py +++ b/src/midtools/reference.py @@ -14,10 +14,12 @@ if TYPE_CHECKING: from midtools.analysis import ReadAnalysis -from midtools.offsets import analyzeOffets, findSignificantOffsets -from midtools.plotting import plotSAM -from midtools.plotting import plotCoverageAndSignificantLocations -from midtools.plotting import plotBaseFrequencies +from midtools.offsets import analyzeOffsets, findSignificantOffsets +from midtools.plotting import ( + plotSAM, + plotCoverageAndSignificantLocations, + plotBaseFrequencies, +) from midtools.read import AlignedRead from midtools.utils import baseCountsToStr, commas, quoted, s, alignmentQuality @@ -236,10 +238,7 @@ def _initialAnalysis(self) -> None: f"{str(filename)!r}" ) plotSAM( - SAMFilter( - self.alignmentFile, - referenceIds={self.id}, - ), + SAMFilter(self.alignmentFile, referenceIds={self.id}), filename, title=f"Mapping {self.readAnalysis.sampleName} reads against {self.id}", jitter=0.45, @@ -249,7 +248,7 @@ def _initialAnalysis(self) -> None: self.readCountAtOffset, self.baseCountAtOffset, self.readsAtOffset, - ) = analyzeOffets(len(self.read), self.alignedReads) + ) = analyzeOffsets(len(self.read), self.alignedReads) self.significantOffsets = list( findSignificantOffsets(