-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapping.py
More file actions
51 lines (46 loc) · 1.45 KB
/
mapping.py
File metadata and controls
51 lines (46 loc) · 1.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import sys
import re
import config
# Open the document pi file, extract each doc index and doc name, and convert them to the file-name format.
def readFile(files, documentPiFile=None):
    """Append ``(fileIndex, fileName)`` tuples parsed from the document pi file.

    Every line of the pi file is split on whitespace and only the tokens that
    contain ``"d_"`` are kept. The first such token is treated as the document
    index and the second as the document name. Lines that yield fewer than two
    such tokens (blank lines, headers, ...) are skipped instead of raising
    ``IndexError``.

    Args:
        files: List that is extended in place with ``(fileIndex, fileName)``
            string tuples, in file order.
        documentPiFile: Optional path of the pi file to read. When ``None``
            (the default) the path is taken from ``config.getDocumentPi()``.

    Returns:
        None. Results are delivered through *files*.

    Raises:
        OSError: If the pi file cannot be opened.
    """
    if documentPiFile is None:
        documentPiFile = config.getDocumentPi()

    with open(documentPiFile, "r") as doc:
        for line in doc:
            docTokens = [word for word in line.split() if "d_" in word]
            if len(docTokens) < 2:
                # Not a "<index> <name>" line; nothing to map.
                continue
            docIndex, docName = docTokens[0], docTokens[1]

            # Drop the "d_" marker from the index token.
            fileIndex = re.sub(r'd_', '', docIndex)
            # NOTE(review): this is a character class, so it removes every
            # '<', 'd', '_', '>' and whitespace character anywhere in the
            # token, not only a leading "<d_" / trailing ">". Confirm that
            # document names never legitimately contain 'd' or '_'.
            fileName = re.sub(r'[<d_>\s]', '', docName)

            files.append((fileIndex, fileName))
# Map those doc indexes to their corresponding files in the raw tweets directory.
# Module-level accumulator filled by mapToRawFiles(); each entry is a
# (doc index, "<doc name>_texts_" search string, list of file lines) tuple.
rawFiles = []


def mapToRawFiles():
    """Load the raw tweet files that belong to each document in the pi file.

    Document ``(index, name)`` pairs are obtained from ``readFile``. The
    directory tree under ``config.rawTweetsPath()`` is walked once, and every
    regular file whose name contains ``"<name>_texts_"`` is read in full. For
    each match a ``(index, "<name>_texts_", lines)`` tuple is appended to the
    module-level ``rawFiles`` list.

    Returns:
        None. Results are delivered through ``rawFiles``.

    Raises:
        OSError: If a matching file cannot be opened.
    """
    rawTweetsRoot = config.rawTweetsPath()
    files = []
    readFile(files)

    # Walk the tree a single time. os.walk already descends into every
    # subdirectory, so no explicit recursion is needed, and the root path is
    # never rebound while iterating.
    candidates = []
    for root, _dirNames, fileNames in os.walk(rawTweetsRoot):
        for fileName in fileNames:
            filePath = os.path.join(root, fileName)
            if os.path.isfile(filePath):
                candidates.append((fileName, filePath))

    for fileIndex, docName in files:
        fileNameStr = docName + "_texts_"
        for fileName, filePath in candidates:
            # Substring match: the search string may appear anywhere in the name.
            if fileNameStr not in fileName:
                continue
            # Undecodable bytes are dropped rather than aborting the whole run.
            with open(filePath, 'r', errors='ignore') as rawFile:
                lines = rawFile.readlines()
            rawFiles.append((fileIndex, fileNameStr, lines))
# Executed at module top level, i.e. whenever this file is run or imported:
# fills the module-level rawFiles list as a side effect.
mapToRawFiles()