-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathConvertSubToUTF8.py
More file actions
113 lines (104 loc) · 3.38 KB
/
ConvertSubToUTF8.py
File metadata and controls
113 lines (104 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#coding:utf-8
# Reads subtitle files with the '.srt' extension, changes the file's format
# (encoding), and leaves only the subtitle content in a new file.
#Date:2015.11.22
#By YE Zhe
import os.path
import chardet
import argparse
import os
# Collect the subtitle text found in every .srt file below directory 'p'.
def ReadSubFiles(p):
    """Return the cue text lines of every ``.srt`` file found under *p*.

    The directory tree rooted at ``p`` is walked recursively.  Every file
    whose extension is exactly ``.srt`` is read as bytes, its encoding is
    guessed with ``chardet`` and the content is decoded once.  For each
    timing line (a line containing ``-->``) the line that immediately
    follows it is stripped and, when not empty, appended to the result.

    Args:
        p: Path of the directory to search.

    Returns:
        A list of decoded text lines, in the order the files were visited.
        Files that cannot be decoded at all are skipped; undecodable bytes
        are replaced rather than aborting the whole file.
    """
    print(p)
    result = []
    srt_files_path = []
    for root, _dirs, files in os.walk(p):
        for name in files:
            print(name)
            if os.path.splitext(name)[1] == '.srt':
                srt_files_path.append(os.path.join(root, name))

    for path in srt_files_path:
        print(path)
        # Binary mode: chardet needs raw bytes, and decoding is done
        # explicitly below so the result is always text, never raw bytes.
        with open(path, 'rb') as input_f:
            raw = input_f.read()
        code = chardet.detect(raw)
        print(code)
        # chardet reports None when it cannot guess (e.g. an empty file).
        encoding = code['encoding'] or 'utf-8'
        try:
            text = raw.decode(encoding)
        except UnicodeDecodeError as err:
            print(err)
            print("There is something wrong with decode.")
            # Best effort: keep the readable part of the file.
            text = raw.decode(encoding, 'replace')
        except LookupError as err:
            # The detected encoding name is unknown to this Python build.
            print(err)
            print("There is something wrong with decode.")
            continue

        # NOTE(review): only the first text line after each timing line is
        # kept, as in the original implementation; further lines of a
        # multi-line cue are ignored -- confirm this is intended.
        lines = iter(text.split(u'\n'))
        for line in lines:
            if u'-->' not in line:
                continue
            cue_text = next(lines, u'').strip()
            if cue_text:
                result.append(cue_text)
    return result
def ConvertToUTF8(str):
    """Convert GB2312/GBK encoded bytes (or text) to UTF-8 encoded bytes.

    Args:
        str: The value to convert.  Byte strings are decoded as GB2312
            first and as GBK when that fails; text (unicode) is encoded
            directly.  (The parameter name shadows the builtin but is kept
            for backward compatibility with keyword callers.)

    Returns:
        The UTF-8 encoded bytes, or an empty byte string when the input can
        neither be decoded nor encoded.
    """
    text = str
    if isinstance(str, (bytes, bytearray)):
        try:
            text = str.decode('gb2312')
        except UnicodeDecodeError:
            print('not gb2312')
            try:
                # GBK is a superset of GB2312, so this is the wider retry.
                text = str.decode('GBK')
            except UnicodeDecodeError as err:
                print(err)
                return b""
    try:
        return text.encode('UTF-8')
    except (UnicodeError, AttributeError) as err:
        # AttributeError: the input was neither bytes nor text.
        print(err)
        return b""
def newConvertToUTF8(line, code, line_no=0):
    """Re-encode one line as UTF-8 using a chardet detection result.

    Args:
        line: The line to convert, normally the raw bytes read from the
            subtitle file.  Text (unicode) input is encoded directly.
        code: The dict returned by ``chardet.detect``; only its
            ``'encoding'`` entry is used.
        line_no: Line number used in the diagnostic messages.

    Returns:
        The UTF-8 encoded bytes of the line, or an empty byte string when
        the line can be neither decoded nor encoded.
    """
    text = line
    if isinstance(line, (bytes, bytearray)):
        try:
            text = line.decode(code['encoding'])
        except (UnicodeError, LookupError, TypeError) as decode_err:
            # TypeError: the detected encoding (or `code` itself) is None.
            # LookupError: missing 'encoding' key or unknown codec name.
            print("There is something wrong with decode at line %s" % (line_no,))
            print(decode_err)
            # Python 2 implicitly fell back to ASCII at this point; do the
            # same explicitly so pure-ASCII lines survive a bad detection.
            try:
                text = line.decode('ascii')
            except UnicodeDecodeError as ascii_err:
                print("There is something wrong with encode at line %s" % (line_no,))
                print(ascii_err)
                return b""
    try:
        return text.encode('UTF-8')
    except (UnicodeError, AttributeError) as encode_err:
        print("There is something wrong with encode at line %s" % (line_no,))
        print(encode_err)
        return b""
if __name__ == '__main__':
    # Command-line entry point: re-encode a single subtitle file as UTF-8.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help="the subs need convert")
    parser.add_argument("-o", "--output", default="",
                        help='the output file,default is "utf8_" + input file name')
    parser.add_argument("-r", "--replace", action="store_true",
                        help="this flag will cause the file replace,it's convinient but risky")
    args = parser.parse_args()

    # Destination: --replace wins, then an explicit --output, otherwise the
    # input file name prefixed with "utf8_" in the input's own directory.
    if args.replace:
        file_name = args.input
    elif args.output:
        file_name = args.output
    else:
        input_dir, input_base = os.path.split(args.input)
        file_name = os.path.join(input_dir, "utf8_" + input_base)

    # The temp file lives next to the destination so the final move stays
    # on one filesystem.
    output_dir, output_base = os.path.split(file_name)
    tmp_output_file_name = os.path.join(output_dir, '.tmp' + output_base)

    # Read the input once, as bytes, before creating any output file;
    # the same bytes feed both the detection and the conversion.
    with open(args.input, 'rb') as f:
        raw = f.read()
    # detect code format
    code = chardet.detect(raw)

    # os.replace (Python 3.3+) overwrites an existing destination on every
    # platform; fall back to os.rename where it is not available.
    move = getattr(os, 'replace', os.rename)
    try:
        with open(tmp_output_file_name, 'wb') as output_file:
            # NOTE: lines are split on raw newline bytes before decoding,
            # which assumes an ASCII-compatible encoding; UTF-16/32 input
            # would be split in the middle of a character.
            for line_no, line in enumerate(raw.splitlines(True), 1):
                output_file.write(newConvertToUTF8(line, code, line_no=line_no))
        move(tmp_output_file_name, file_name)
    finally:
        # After a successful move the temp file is gone; otherwise remove
        # the partial output so failures leave nothing behind.
        if os.path.exists(tmp_output_file_name):
            os.remove(tmp_output_file_name)