UnsupervisedPeakCaller/preprocessing.bash at main · WhenGryphonsFly/UnsupervisedPeakCaller · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/bin/bash

helpFunction()
{
   echo ""
   echo "Usage: $0 -p \"program directory\" -i \"input directory\" -o \"output directory\" -g hg -c 2 -m \"merged.bam\" -b \"indi1.bam indi2.bam\" -t 12 -n test -L 1000"
   echo -e "\t-p Absolute directory of where the program is installed at."
   echo -e "\t-i Absolute directory of input files."
   echo -e "\t-o Absolute directory of output files."
   echo -e "\t-g Genome that the data is aligned to. Currently support mm10 (Ensembl), hg38 (Ensembl), danRer11 (Ensembl)."
   echo -e "\t-c Cutoff for prefiltering. Either \"median\" or specific number."
   echo -e "\t-m Bam files merged from individual replicate. Only used for preprocessing purpose, not for calling peaks. Must be indexed and sorted."
   echo -e "\t-b Individual bam files of every replicates. Must be indexed and sorted."
   echo -e "\t-t Number of threads to use."
   echo -e "\t-n File name prefix."
   echo -e "\t-L Length of input segments."

   exit 1 # Exit script after printing help
}


while getopts ":p:i:o:g:c:m:b:t:n:L:" opt
do
    case "$opt" in
        p) pdir="$OPTARG";;
        i) indir="$OPTARG";;
        o) outdir="$OPTARG";;
        g) genome="$OPTARG";;
        c) cutoff="$OPTARG";;
        m) mergedBam="$OPTARG";;
        b) indivBam="$OPTARG";;
        t) thread="$OPTARG";;
        n) prefix="$OPTARG";;
        L) inputLn="$OPTARG";;
	s) skip="$OPTARG";;
        ?) helpFunction ;; # Print helpFunction in case parameter is non-existent
    esac
done

# Print helpFunction in case parameters are empty
if [ -z "$pdir" ] || [ -z "$indir" ] || [ -z "$outdir" ] || [ -z "$genome" ]  || [ -z "$cutoff" ]  || [ -z "$mergedBam" ] || [ -z "$indivBam" ] || [ -z "$thread" ] || [ -z "$prefix" ] || [ -z "$inputLn" ] #|| [ -z "$skip" ]
then
   echo "Some or all of the parameters are empty";
   helpFunction
fi


#step 1 - get necessary count files
#step 1.0: define chromosome numbers
chrList=`samtools view -H "$indir"/"$mergedBam" | grep "SN:" | cut -f 2 | sed 's/SN://g'`


#step 1.1: get bga genomecov of the data
for i in $chrList
do
	samtools view -hb $indir/$mergedBam $i -@ $thread | bedtools genomecov -ibam - -pc -bga | grep -w "^$i" > $outdir/"$i"_"$prefix"_merged-bga.begGraph
done

for i in $indivBam
do
	o=`echo $i | sed 's/.bam//g'`
	bedtools genomecov -ibam "$indir"/"$i" -pc -bga > "$outdir"/"$prefix"_"$o"-bga.begGraph
done
#fi

#step 1.2: get counts based on the cutoff
if [[ $cutoff == "median" ]]
then
  for i in $indivBam
	do
        	o=`echo $i | sed 's/.bam//g'`
		bash "$pdir"/getMedian.bash "$chrList" "$outdir"/"$prefix"_"$o"-bga.begGraph "$outdir"/"$prefix"_median.txt
	done

  for i in $chrList
	do
		c=`grep -w "^$i" "$outdir"/"$prefix"_median.txt | sort -k2,2n | cut -f 2 | head -n 1`
		mkdir $outdir/chr"$i"
		echo $indivBam | sed 's/ /\n/g' | parallel "bash \"$pdir\"/getAboveThreshold.bash \"$indir\"/{} \"$i\" \"$c\" \"$outdir\"/chr\"$i\"/\"$i\"_{}-temp.txt \"$thread\""
	done
else
  c="$cutoff"
  for i in $chrList
        do
                mkdir $outdir/chr"$i"
                echo $indivBam | sed 's/ /\n/g' | parallel "bash \"$pdir\"/getAboveThreshold.bash \"$indir\"/{} \"$i\" \"$c\" \"$outdir\"/chr\"$i\"/\"$i\"_{}-temp.txt \"$thread\""
        done
fi

#step 2: get positions that pass the threshold in every replicate
for i in $chrList
do
bedops --intersect "$outdir"/chr"$i"/"$i"_* | bedtools merge -d 90 -i - | awk '{if ($3-$2 >= 100) {print}}' | awk '{$(NF+1)="segment"NR}1' | sed 's/ /\t/g' | bedtools intersect -wa -wb -a - -b "$outdir"/"$i"_"$prefix"_merged-bga.begGraph > "$outdir"/chr"$i"/temp"$i".txt
done

#step 3: get $inputLn bp regions
arr=($chrList)
m=${#arr[@]}
for((n=0;n<${#arr[@]};n++)); do
        if (( $(($n % 3 )) == 0 & $(($n < ($m-2) )))); then
                # Run every 3 entries
                echo ${arr[n]} ${arr[n+1]} ${arr[n+2]} | sed 's/ /\n/g' | parallel "Rscript --vanilla \"$pdir\"/getCountFiles.R \"$outdir\"/chr{}/temp{}.txt \"$inputLn\" \"$outdir\"/chr{}/{}regions.txt"
        fi
done

if [[ $(($m % 3 )) == 1 ]]
then
	echo ${arr[$m-1]} | parallel "Rscript --vanilla \"$pdir\"/getCountFiles.R \"$outdir\"/chr{}/temp{}.txt \"$inputLn\" \"$outdir\"/chr{}/{}regions.txt"
elif [[ $(($m % 3 )) == 2 ]]
then
	echo ${arr[$m-2]} ${arr[$m-1]} | sed 's/ /\n/g' | parallel "Rscript --vanilla \"$pdir\"/getCountFiles.R \"$outdir\"/chr{}/temp{}.txt \"$inputLn\" \"$outdir\"/chr{}/{}regions.txt"
fi


#step 3.1: remove blacklist
if [[ $genome == "hg" ]]
then
  echo "Blacklist regions are Ensembl genome. This will not work if the genome used for alignment is from UCSC."
  bl=""$pdir"/hg38-blacklist.v2.ensembl.bed"
elif [[ $genome == "mm" ]]
then
  echo "Blacklist regions are Ensembl genome. This will not work if the genome used for alignment is from UCSC."
  bl=""$pdir"/mm10-blacklist.v2.ensembl.bed"
else
  echo "Only human and mouse genome blacklists are supported for now. Blacklist regions are Ensembl genome. This will not work if the genome used for alignment is from UCSC. If not mouse or human, will skip removing blacklist regions."
fi

if [[ $genome == "hg" ]] || [[ $genome == "mm" ]]
then
	for i in $chrList
	do
		tail -n +2 "$outdir"/chr"$i"/"$i"regions.txt | sed 's/chr//g' | bedtools intersect -v -a - -b "$bl" > "$outdir"/chr"$i"/"$i"regions2.txt
	done
else
        for i in $chrList
        do
                tail -n +2 "$outdir"/chr"$i"/"$i"regions.txt | sed 's/chr//g' > "$outdir"/chr"$i"/"$i"regions2.txt
        done
fi


#step 4: get counts
indivBam2=`echo $indivBam | sed 's/.bam//g'`
arr=($indivBam2)
m=${#arr[@]}

for i in $chrList; do
for((n=0;n<${#arr[@]};n++)); do
        if (( $(($n % 3 )) == 0 & $(($n < ($m-2) )))); then
                # Run every 3 entries
                echo ${arr[n]} ${arr[n+1]} ${arr[n+2]} | sed 's/ /\n/g' | parallel "bedtools intersect -wb -a \"$outdir\"/chr\"$i\"/\"$i\"regions2.txt -b \"$outdir\"/\"$prefix\"_{}-bga.begGraph | cut -f1-4,8 > \"$outdir\"/chr\"$i\"/\"$i\".\"$prefix\"_{}.covBga.txt"
        fi
done

if [[ $(($m % 3 )) == 1 ]]
then
        echo ${arr[$m-1]} | parallel "bedtools intersect -wb -a \"$outdir\"/chr\"$i\"/\"$i\"regions2.txt -b \"$outdir\"/\"$prefix\"_{}-bga.begGraph | cut -f1-4,8 > \"$outdir\"/chr\"$i\"/\"$i\".\"$prefix\"_{}.covBga.txt"
elif [[ $(($m % 3 )) == 2 ]]
then
        echo ${arr[$m-2]} ${arr[$m-1]} | sed 's/ /\n/g' | parallel "bedtools intersect -wb -a \"$outdir\"/chr\"$i\"/\"$i\"regions2.txt -b \"$outdir\"/\"$prefix\"_{}-bga.begGraph | cut -f1-4,8 > \"$outdir\"/chr\"$i\"/\"$i\".\"$prefix\"_{}.covBga.txt"
fi
done

#step 5: get big inputs
echo $chrList | sed 's/ /\n/g' > "$outdir"/chrList.txt
Rscript --vanilla "$pdir"/bigInputs.R "$outdir"/chrList.txt "$outdir"

#step 5: remove extra files
rm "$outdir"/*-bga.begGraph
rm "$outdir"/chr*/*.bam-temp.txt
rm "$outdir"/chr*/*regions.txt