-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathintergenicProfile.cpp
More file actions
executable file
·140 lines (116 loc) · 4.12 KB
/
intergenicProfile.cpp
File metadata and controls
executable file
·140 lines (116 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#include <iostream>
#include "wigreader.h"
#include <fstream>
#include <string>
#include <sstream>
#include <math.h>
#include <cstdlib>
#include "functions.h"
#include <cstring>
#include <map>
#include <algorithm>
using namespace std;
string profileOutputFile(string wig_file) {
string output_filename = file_prefix(basename(wig_file));
output_filename.append(".intergenic_profile.txt");
cout << output_filename << endl;
return output_filename;
}
void profileForRegion(BEDLINE &gene, WigReader &reader, ofstream &output_stream) {
ostringstream outbuff;
//outbuff.precision(3);
//outbuff << fixed;
outbuff << gene.gene << "\t" << gene.chr << "\t" << gene.start << "\t" << gene.end << "\t" << gene.strand;
try {
float v = reader.rpkm(gene.chr, gene.start, gene.end);
outbuff << "\t" << v;
outbuff << endl;
output_stream << outbuff.str();
}
catch(OutOfWigRangeException outofwigrangeexception) {
return;
}
}
int main(int argc, char* argv[]) {
if(argc != 4) {
cout << endl;
cout << "Usage: intergenicProfile genes.bed example.wig dist_away_from_genes" << cout;
cout << endl;
exit(1);
}
string gene_bed_file = argv[1]; //"/home/tarakhovsky/genomics/useful_bed_files/mm9.tss.2kb.bed"
string wig_file = argv[2];
string header_postpend = file_prefix(basename(wig_file));
int buffer_dist = atoi(argv[3]);
const int MIN_SIZE = 1000;
string output_file = profileOutputFile(wig_file);
vector<BEDLINE> genes;
map<string,vector<BEDLINE> > genesMapByChr;
readBedFile(gene_bed_file,genes);
bedlinesVectorToMapByChr(genes,genesMapByChr);
WigReader reader(wig_file);
reader.Read();
ofstream output_stream;
output_stream.open (output_file.c_str());
//output_stream.precision(3);
//output_stream << fixed;
output_stream << "chr\tstart\tend\trpkm" << endl;
map<string, vector<BEDLINE> >::const_iterator it1;
for(it1 = genesMapByChr.begin(); it1 != genesMapByChr.end(); ++it1){
string chr = (*it1).first;
vector<BEDLINE> genesInThisChr = (*it1).second;
/* sort the genes in this Chr by start and then end. */
sort(genesInThisChr.begin(),genesInThisChr.end());
/* Plan:
1. Find the beginning of the recorded data for this chr
2. Calculate mean(RPKM) from beginning until beginning of first gene record (minus buffer).
3. Find end of last used gene record + buffer and record as new "beginning"
*/
int currentPos = reader.startForChromosome(chr);
int endChr = reader.endForChromosome(chr);
for(int currentGeneIndex=0; currentGeneIndex < genesInThisChr.size(); currentGeneIndex +=1) {
BEDLINE g = genesInThisChr[currentGeneIndex];
/* If we are past the end of the chr */
if(currentPos >= endChr) {
break;
}
// If our currentPos is beyond our current gene, we need to hop to the next gene.
// This should not happen.
if(currentPos >= (g.end + buffer_dist)) {
currentGeneIndex += 1;
continue;
}
// If currentPos is inside the current gene+buffer_dist, we need to skip to the end of this
// gene + buffer_dist and to the next gene.
if(currentPos >= (g.start - buffer_dist)) {
currentPos = g.end + buffer_dist;
currentGeneIndex += 1;
continue;
}
// Now we know we are somewhere before the next gene's buffer_dist and after the last gene's buffer_dist
int endPos = g.start - buffer_dist; // This is where we are going until
// If the region is smaller than the MIN_SIZE, then we skip it and move to the next gene.
if((endPos - currentPos) < MIN_SIZE) {
currentPos = g.end + buffer_dist;
currentGeneIndex += 1;
continue;
}
// Finally, we do due diligance.
// Are we past the limit of the chr?
// If so, let's shorten our limit
if(endPos > endChr) {
endPos = endChr;
if(endPos < currentPos) {
cerr << "ERROR: currentPos got ahead of the end of the chr." << endl;
exit(1);
}
}
float rpkm = reader.rpkm(chr, currentPos, endPos);
// We are gt $MIN_SIZE away from the next gene's buffer_dist. Let's output the RPKM here.
output_stream << chr << "\t" << currentPos << "\t" << endPos << "\t" << rpkm << endl;
currentPos = endPos + 1;
}
}
output_stream.close();
return 0;
}