Skip to content

Commit e096abc

Browse files
committed
Normalize CRLF to LF in ParserStream
1 parent ed07e7a commit e096abc

File tree

5 files changed

+280
-65
lines changed

5 files changed

+280
-65
lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
tests/syntax/fixtures_reference/crlf.ftl eol=crlf
2+
tests/syntax/fixtures_structure/crlf.ftl eol=crlf

fluent/syntax/parser.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import unicode_literals
22
import re
33
from . import ast
4-
from .stream import FluentParserStream
4+
from .stream import EOL, FluentParserStream
55
from .errors import ParseError
66

77

@@ -156,10 +156,10 @@ def get_zero_four_style_comment(self, ps):
156156
content = ''
157157

158158
while True:
159-
ch = ps.take_char(lambda x: x != '\n')
159+
ch = ps.take_char(lambda x: x != EOL)
160160
while ch:
161161
content += ch
162-
ch = ps.take_char(lambda x: x != '\n')
162+
ch = ps.take_char(lambda x: x != EOL)
163163

164164
if ps.is_next_line_zero_four_comment(skip=False):
165165
content += ps.current_char
@@ -197,12 +197,12 @@ def get_comment(self, ps):
197197
if level == -1:
198198
level = i
199199

200-
if ps.current_char != '\n':
200+
if ps.current_char != EOL:
201201
ps.expect_char(' ')
202-
ch = ps.take_char(lambda x: x != '\n')
202+
ch = ps.take_char(lambda x: x != EOL)
203203
while ch:
204204
content += ch
205-
ch = ps.take_char(lambda x: x != '\n')
205+
ch = ps.take_char(lambda x: x != EOL)
206206

207207
if ps.is_next_line_comment(skip=False, level=level):
208208
content += ps.current_char
@@ -220,7 +220,7 @@ def get_comment(self, ps):
220220
@with_span
221221
def get_group_comment_from_section(self, ps):
222222
def until_closing_bracket_or_eol(ch):
223-
return ch not in (']', '\n')
223+
return ch not in (']', EOL)
224224

225225
ps.expect_char('[')
226226
ps.expect_char('[')
@@ -421,25 +421,24 @@ def get_value(self, ps):
421421
def get_variant_list(self, ps):
422422
ps.expect_char('{')
423423
ps.skip_blank_inline()
424-
ps.expect_char('\n')
424+
ps.expect_line_end()
425425
ps.skip_blank()
426426
variants = self.get_variants(ps)
427-
ps.expect_char('\n')
427+
ps.expect_line_end()
428428
ps.skip_blank()
429429
ps.expect_char('}')
430430
return ast.VariantList(variants)
431431

432432
@with_span
433433
def get_pattern(self, ps):
434434
elements = []
435-
ps.skip_blank_inline()
436435

437436
while ps.current_char:
438437
ch = ps.current_char
439438

440439
# The end condition for get_pattern's while loop is a newline
441440
# which is not followed by a valid pattern continuation.
442-
if ch == '\n' and not ps.is_next_line_value(skip=False):
441+
if ch == EOL and not ps.is_next_line_value(skip=False):
443442
break
444443

445444
if ch == '{':
@@ -467,23 +466,23 @@ def get_text_element(self, ps):
467466
if ch == '{':
468467
return ast.TextElement(buf)
469468

470-
if ch == '\n':
469+
if ch == EOL:
471470
if not ps.is_next_line_value(skip=False):
472471
return ast.TextElement(buf)
473472

474473
ps.next()
475474
ps.skip_blank_inline()
476475

477-
# Add the new line to the buffer
478-
buf += ch
476+
buf += EOL
479477
continue
480478

481479
if ch == '\\':
482480
ps.next()
483481
buf += self.get_escape_sequence(ps)
484-
else:
485-
buf += ch
486-
ps.next()
482+
continue
483+
484+
buf += ch
485+
ps.next()
487486

488487
return ast.TextElement(buf)
489488

@@ -542,7 +541,7 @@ def get_expression(self, ps):
542541
ps.next()
543542

544543
ps.skip_blank_inline()
545-
ps.expect_char('\n')
544+
ps.expect_line_end()
546545
ps.skip_blank()
547546

548547
variants = self.get_variants(ps)
@@ -675,18 +674,18 @@ def get_string(self, ps):
675674

676675
ps.expect_char('"')
677676

678-
ch = ps.take_char(lambda x: x != '"' and x != '\n')
677+
ch = ps.take_char(lambda x: x != '"' and x != EOL)
679678
while ch:
680679
if ch == '\\':
681680
val += self.get_escape_sequence(ps, ('{', '\\', '"'))
682681
else:
683682
val += ch
684-
ch = ps.take_char(lambda x: x != '"' and x != '\n')
683+
ch = ps.take_char(lambda x: x != '"' and x != EOL)
685684

686-
if ps.current_char == '\n':
685+
if ps.current_char == EOL:
687686
raise ParseError('E0020')
688687

689-
ps.next()
688+
ps.expect_char('"')
690689

691690
return ast.StringLiteral(val)
692691

fluent/syntax/stream.py

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,23 @@ def __init__(self, string):
88
self.index = 0
99
self.peek_offset = 0
1010

11-
def char_at(self, index):
11+
def get(self, offset):
1212
try:
13-
return self.string[index]
13+
return self.string[offset]
1414
except IndexError:
1515
return None
1616

17+
def char_at(self, offset):
18+
# When the cursor is at CRLF, return LF but don't move the cursor. The
19+
# cursor still points to the EOL position, which in this case is the
20+
# beginning of the compound CRLF sequence. This ensures slices of
21+
# [inclusive, exclusive) continue to work properly.
22+
if self.get(offset) == '\r' \
23+
and self.get(offset + 1) == '\n':
24+
return '\n'
25+
26+
return self.get(offset)
27+
1728
@property
1829
def current_char(self):
1930
return self.char_at(self.index)
@@ -23,13 +34,21 @@ def current_peek(self):
2334
return self.char_at(self.index + self.peek_offset)
2435

2536
def next(self):
26-
self.index += 1
2737
self.peek_offset = 0
28-
return self.char_at(self.index)
38+
# Skip over CRLF as if it was a single character.
39+
if self.get(self.index) == '\r' \
40+
and self.get(self.index + 1) == '\n':
41+
self.index += 1
42+
self.index += 1
43+
return self.get(self.index)
2944

3045
def peek(self):
46+
# Skip over CRLF as if it was a single character.
47+
if self.get(self.index + self.peek_offset) == '\r' \
48+
and self.get(self.index + self.peek_offset + 1) == '\n':
49+
self.peek_offset += 1
3150
self.peek_offset += 1
32-
return self.char_at(self.index + self.peek_offset)
51+
return self.get(self.index + self.peek_offset)
3352

3453
def reset_peek(self, offset=0):
3554
self.peek_offset = offset
@@ -39,38 +58,28 @@ def skip_to_peek(self):
3958
self.peek_offset = 0
4059

4160

42-
INLINE_WS = ' '
43-
ANY_WS = (INLINE_WS, '\n')
61+
EOL = '\n'
62+
EOF = None
4463
SPECIAL_LINE_START_CHARS = ('}', '.', '[', '*')
4564

4665

4766
class FluentParserStream(ParserStream):
4867
last_comment_zero_four_syntax = False
4968

50-
def __init__(self, string):
51-
# Normalize line endings to LF.
52-
string = string.replace('\r\n', '\n')
53-
super(FluentParserStream, self).__init__(string)
54-
5569
def skip_blank_inline(self):
56-
while self.current_char:
57-
if self.current_char != INLINE_WS:
58-
break
70+
while self.current_char == ' ':
5971
self.next()
6072

6173
def peek_blank_inline(self):
62-
ch = self.current_peek
63-
while ch:
64-
if ch != INLINE_WS:
65-
break
66-
ch = self.peek()
74+
while self.current_peek == ' ':
75+
self.peek()
6776

6877
def skip_blank_block(self):
6978
line_count = 0
7079
while True:
7180
self.peek_blank_inline()
7281

73-
if self.current_peek == '\n':
82+
if self.current_peek == EOL:
7483
self.skip_to_peek()
7584
self.next()
7685
line_count += 1
@@ -84,46 +93,48 @@ def peek_blank_block(self):
8493

8594
self.peek_blank_inline()
8695

87-
if self.current_peek == '\n':
96+
if self.current_peek == EOL:
8897
self.peek()
8998
else:
9099
self.reset_peek(line_start)
91100
break
92101

93102
def skip_blank(self):
94-
while self.current_char in ANY_WS:
103+
while self.current_char in (" ", EOL):
95104
self.next()
96105

97106
def peek_blank(self):
98-
while self.current_peek in ANY_WS:
107+
while self.current_peek in (" ", EOL):
99108
self.peek()
100109

101110
def expect_char(self, ch):
102111
if self.current_char == ch:
103112
self.next()
104113
return True
105114

106-
if ch == '\n':
107-
# Unicode Character 'SYMBOL FOR NEWLINE' (U+2424)
108-
raise ParseError('E0003', '\u2424')
109-
110115
raise ParseError('E0003', ch)
111116

112117
def expect_line_end(self):
113-
if self.current_char is None:
118+
if self.current_char is EOF:
114119
# EOF is a valid line end in Fluent.
115120
return True
116-
return self.expect_char('\n')
121+
122+
if self.current_char == EOL:
123+
self.next()
124+
return True
125+
126+
# Unicode Character 'SYMBOL FOR NEWLINE' (U+2424)
127+
raise ParseError('E0003', '\u2424')
117128

118129
def take_char(self, f):
119130
ch = self.current_char
120-
if ch is not None and f(ch):
131+
if ch is not EOF and f(ch):
121132
self.next()
122133
return ch
123134
return None
124135

125-
def is_char_id_start(self, ch=None):
126-
if ch is None:
136+
def is_char_id_start(self, ch):
137+
if ch is EOF:
127138
return False
128139

129140
cc = ord(ch)
@@ -135,7 +146,8 @@ def is_identifier_start(self):
135146

136147
def is_number_start(self):
137148
ch = self.peek() if self.current_char == '-' else self.current_char
138-
if ch is None:
149+
if ch is EOF:
150+
self.reset_peek()
139151
return False
140152

141153
cc = ord(ch)
@@ -144,7 +156,7 @@ def is_number_start(self):
144156
return is_digit
145157

146158
def is_char_pattern_continuation(self, ch):
147-
if ch is None:
159+
if ch is EOF:
148160
return False
149161

150162
return ch not in SPECIAL_LINE_START_CHARS
@@ -157,7 +169,7 @@ def is_value_start(self, skip):
157169
ch = self.current_peek
158170

159171
# Inline Patterns may start with any char.
160-
if ch is not None and ch != '\n':
172+
if ch is not EOF and ch != EOL:
161173
self.skip_to_peek()
162174
return True
163175

@@ -167,7 +179,7 @@ def is_next_line_zero_four_comment(self, skip):
167179
if skip is True:
168180
raise NotImplementedError()
169181

170-
if self.current_peek != '\n':
182+
if self.current_peek != EOL:
171183
return False
172184

173185
is_comment = (self.peek(), self.peek()) == ('/', '/')
@@ -182,7 +194,7 @@ def is_next_line_comment(self, skip, level=-1):
182194
if skip is True:
183195
raise NotImplementedError()
184196

185-
if self.current_peek != '\n':
197+
if self.current_peek != EOL:
186198
return False
187199

188200
i = 0
@@ -195,7 +207,8 @@ def is_next_line_comment(self, skip, level=-1):
195207
break
196208
i += 1
197209

198-
if self.peek() in [' ', '\n']:
210+
# The first char after #, ## or ###.
211+
if self.peek() in (' ', EOL):
199212
self.reset_peek()
200213
return True
201214

@@ -206,7 +219,7 @@ def is_next_line_variant_start(self, skip):
206219
if skip is True:
207220
raise NotImplementedError()
208221

209-
if self.current_peek != '\n':
222+
if self.current_peek != EOL:
210223
return False
211224

212225
self.peek_blank()
@@ -235,7 +248,7 @@ def is_next_line_attribute_start(self, skip):
235248
return False
236249

237250
def is_next_line_value(self, skip):
238-
if self.current_peek != '\n':
251+
if self.current_peek != EOL:
239252
return False
240253

241254
self.peek_blank_block()
@@ -261,15 +274,15 @@ def is_next_line_value(self, skip):
261274
return True
262275

263276
def skip_to_next_entry_start(self, junk_start):
264-
last_newline = self.string.rfind('\n', 0, self.index)
277+
last_newline = self.string.rfind(EOL, 0, self.index)
265278
if junk_start < last_newline:
266279
# Last seen newline is _after_ the junk start. It's safe to rewind
267280
# without the risk of resuming at the same broken entry.
268281
self.index = last_newline
269282

270283
while self.current_char:
271284
# We're only interested in beginnings of line.
272-
if self.current_char != '\n':
285+
if self.current_char != EOL:
273286
self.next()
274287
continue
275288

0 commit comments

Comments
 (0)