From 2c0410626fdfee9b599bbf68e54213514f7f3d89 Mon Sep 17 00:00:00 2001 From: Owen Carey <37121709+owenthcarey@users.noreply.github.com> Date: Sun, 31 May 2026 15:11:29 -0700 Subject: [PATCH 1/2] feat: replace regex shim with a faithful CPython re/_sre engine --- crates/weavepy-compiler/src/lib.rs | 21 + crates/weavepy-parser/src/parser.rs | 14 + crates/weavepy-vm/src/builtin_types.rs | 16 + crates/weavepy-vm/src/builtins.rs | 161 +- crates/weavepy-vm/src/lib.rs | 560 ++++-- crates/weavepy-vm/src/object.rs | 101 +- crates/weavepy-vm/src/stdlib/mod.rs | 56 +- crates/weavepy-vm/src/stdlib/python/enum.py | 29 +- .../src/stdlib/python/re_casefix.py | 106 ++ .../src/stdlib/python/re_compiler.py | 775 ++++++++ .../src/stdlib/python/re_constants.py | 222 +++ .../weavepy-vm/src/stdlib/python/re_engine.py | 426 +++++ .../weavepy-vm/src/stdlib/python/re_init.py | 350 ++++ .../weavepy-vm/src/stdlib/python/re_parser.py | 1081 ++++++++++++ .../src/stdlib/python/sre_compile.py | 11 + .../src/stdlib/python/sre_constants.py | 11 + .../weavepy-vm/src/stdlib/python/sre_parse.py | 11 + crates/weavepy-vm/src/stdlib/re.rs | 1089 ------------ crates/weavepy-vm/src/stdlib/sre_mod.rs | 1557 +++++++++++++++++ crates/weavepy-vm/src/stdlib/thread_real.rs | 2 + crates/weavepy-vm/src/stdlib/weakref_real.rs | 6 +- crates/weavepy-vm/src/types.rs | 20 + docs/rfcs/0035-faithful-re-sre-unicode.md | 435 +++++ tests/regrtest/test_re.py | 116 ++ 24 files changed, 5948 insertions(+), 1228 deletions(-) create mode 100644 crates/weavepy-vm/src/stdlib/python/re_casefix.py create mode 100644 crates/weavepy-vm/src/stdlib/python/re_compiler.py create mode 100644 crates/weavepy-vm/src/stdlib/python/re_constants.py create mode 100644 crates/weavepy-vm/src/stdlib/python/re_engine.py create mode 100644 crates/weavepy-vm/src/stdlib/python/re_init.py create mode 100644 crates/weavepy-vm/src/stdlib/python/re_parser.py create mode 100644 crates/weavepy-vm/src/stdlib/python/sre_compile.py create mode 100644 
crates/weavepy-vm/src/stdlib/python/sre_constants.py create mode 100644 crates/weavepy-vm/src/stdlib/python/sre_parse.py delete mode 100644 crates/weavepy-vm/src/stdlib/re.rs create mode 100644 crates/weavepy-vm/src/stdlib/sre_mod.rs create mode 100644 docs/rfcs/0035-faithful-re-sre-unicode.md create mode 100644 tests/regrtest/test_re.py diff --git a/crates/weavepy-compiler/src/lib.rs b/crates/weavepy-compiler/src/lib.rs index ee7d423..6922b80 100644 --- a/crates/weavepy-compiler/src/lib.rs +++ b/crates/weavepy-compiler/src/lib.rs @@ -4337,6 +4337,27 @@ fn collect_decls( collect_decls(s, globals, nonlocals, assigned); } } + // `import a.b.c` binds the top-level package `a` (or the + // asname); `from m import x as y` binds `y`. These are real + // local bindings and must be tracked so a name captured by a + // nested scope is promoted to a cellvar (CPython parity). + StmtKind::Import(aliases) => { + for a in aliases { + let bind = a + .asname + .clone() + .unwrap_or_else(|| a.name.split('.').next().unwrap_or(&a.name).to_owned()); + assigned.insert(bind); + } + } + StmtKind::ImportFrom { names, .. } => { + for a in names { + let bind = a.asname.clone().unwrap_or_else(|| a.name.clone()); + if bind != "*" { + assigned.insert(bind); + } + } + } _ => {} } } diff --git a/crates/weavepy-parser/src/parser.rs b/crates/weavepy-parser/src/parser.rs index ab0bec6..2805699 100644 --- a/crates/weavepy-parser/src/parser.rs +++ b/crates/weavepy-parser/src/parser.rs @@ -3359,6 +3359,20 @@ fn decode_str_body(s: &str, raw: bool) -> Result { let n = u32::from_str_radix(&hex, 16).map_err(|e| e.to_string())?; out.push(char::from_u32(n).unwrap_or('\u{FFFD}')); } + 'U' => { + // 8-hex code-point escape, e.g. `\U0001F600`. Required + // for non-BMP literals; CPython rejects out-of-range or + // surrogate values, so we surface a clear error. 
+ let mut hex = String::new(); + for _ in 0..8 { + hex.push(chars.next().ok_or("incomplete \\U escape")?); + } + let n = u32::from_str_radix(&hex, 16).map_err(|e| e.to_string())?; + let ch = char::from_u32(n).ok_or_else(|| { + format!("invalid \\U escape: {n:#x} is not a valid character") + })?; + out.push(ch); + } other => { // CPython issues a DeprecationWarning for unknown // escapes but emits both characters literally. diff --git a/crates/weavepy-vm/src/builtin_types.rs b/crates/weavepy-vm/src/builtin_types.rs index b174c47..b12dc5f 100644 --- a/crates/weavepy-vm/src/builtin_types.rs +++ b/crates/weavepy-vm/src/builtin_types.rs @@ -626,6 +626,22 @@ fn install_object_dunders(object_: &Rc) { )) } }; + // When `cls` derives from a primitive immutable built-in (so far + // `int` — covering `_NamedIntConstant`, `enum.IntEnum`/`IntFlag` + // and hand-written `class C(int)`), capture the value the + // instance wraps. `super().__new__(cls, value)` passes it as the + // second positional argument; absent that it defaults to 0. 
+ if cls.is_subclass_of(&builtin_types().int_) { + let native = match args.get(1) { + None => Object::Int(0), + Some(o @ (Object::Int(_) | Object::Long(_))) => o.clone(), + Some(Object::Bool(b)) => Object::Int(i64::from(*b)), + Some(o) => Object::Int(o.as_i64().unwrap_or(0)), + }; + return Ok(Object::Instance(Rc::new(PyInstance::with_native( + cls, native, + )))); + } Ok(Object::Instance(Rc::new(PyInstance::new(cls)))) } fn object_init(_args: &[Object]) -> Result { diff --git a/crates/weavepy-vm/src/builtins.rs b/crates/weavepy-vm/src/builtins.rs index 3a01ebe..3752435 100644 --- a/crates/weavepy-vm/src/builtins.rs +++ b/crates/weavepy-vm/src/builtins.rs @@ -439,6 +439,8 @@ pub fn lookup_method(obj: &Object, name: &str) -> Option { "splitlines" => Some(method("splitlines", bytes_splitlines)), "join" => Some(method("join", bytes_join)), "replace" => Some(method("replace", bytes_replace)), + "translate" => Some(method("translate", bytes_translate)), + "maketrans" => Some(method("maketrans", bytes_maketrans)), "isalnum" => Some(method("isalnum", bytes_isalnum)), "isalpha" => Some(method("isalpha", bytes_isalpha)), "isdigit" => Some(method("isdigit", bytes_isdigit)), @@ -618,6 +620,35 @@ fn b_str(args: &[Object]) -> Result { if args.is_empty() { return Ok(Object::from_static("")); } + // `str(object, encoding[, errors])` decodes a bytes-like object, + // equivalent to `object.decode(encoding, errors)`. CPython's + // `re._parser.Tokenizer` relies on `str(pattern, 'latin1')` to + // tokenize bytes patterns, so this path must decode rather than + // fall back to `repr`-style stringification. 
+ if args.len() >= 2 { + match &args[0] { + Object::Bytes(_) | Object::ByteArray(_) => {} + other => { + return Err(type_error(format!( + "decoding to str: need a bytes-like object, {} found", + other.type_name() + ))); + } + } + let data = bytes_data(args)?; + let encoding = match &args[1] { + Object::Str(e) => e.to_string(), + Object::None => "utf-8".to_owned(), + _ => return Err(type_error("str() argument 'encoding' must be str")), + }; + let errors = match args.get(2) { + Some(Object::Str(e)) => e.to_string(), + Some(Object::None) | None => "strict".to_owned(), + _ => return Err(type_error("str() argument 'errors' must be str")), + }; + let s = crate::stdlib::codecs_mod::decode_bytes(&data, &encoding, &errors)?; + return Ok(Object::from_str(s)); + } Ok(Object::from_str(args[0].to_str())) } @@ -2628,6 +2659,7 @@ pub fn make_super(class: Rc, receiver: Object) -> Obje d.insert(DictKey(Object::from_static("__self__")), receiver); d })), + native: None, }; Object::Instance(Rc::new(inst)) } @@ -3804,7 +3836,9 @@ fn str_isidentifier(args: &[Object]) -> Result { fn str_isprintable(args: &[Object]) -> Result { let s = str_self(args)?; - Ok(Object::Bool(s.chars().all(|c| !c.is_control()))) + Ok(Object::Bool( + s.chars().all(crate::object::char_is_printable), + )) } fn str_zfill(args: &[Object]) -> Result { @@ -4827,17 +4861,56 @@ fn bytes_match_prefix_suffix( } } +/// Resolve the optional `start`/`end` arguments of `bytes.find` and +/// friends (positions 2 and 3) into a clamped `[start, end]` byte +/// window, applying CPython's slice-style negative-index handling. 
+fn bytes_search_range(args: &[Object], len: usize) -> (usize, usize) { + let n = len as i64; + let resolve = |o: Option<&Object>, default: i64| -> i64 { + match o { + None | Some(Object::None) => default, + Some(obj) => match obj.as_i64() { + Some(mut x) => { + if x < 0 { + x += n; + } + x.clamp(0, n) + } + None => default, + }, + } + }; + let start = resolve(args.get(2), 0).clamp(0, n) as usize; + let end = resolve(args.get(3), n).clamp(0, n) as usize; + (start, end.max(start)) +} + +/// Find `sub` within `data[start..end]`, returning the *absolute* +/// position (or -1). Mirrors `bytes.find`'s empty-needle behaviour. +fn bytes_find_in(data: &[u8], sub: &[u8], start: usize, end: usize) -> i64 { + if start > end || end > data.len() { + return -1; + } + let hay = &data[start..end]; + if sub.is_empty() { + return start as i64; + } + if sub.len() > hay.len() { + return -1; + } + hay.windows(sub.len()) + .position(|w| w == sub) + .map_or(-1, |i| (start + i) as i64) +} + fn bytes_find(args: &[Object]) -> Result { let data = bytes_data(args)?; let sub = bytes_argview( args.get(1) .ok_or_else(|| type_error("find() expected 1 arg"))?, )?; - Ok(Object::Int( - data.windows(sub.len()) - .position(|w| w == sub) - .map_or(-1, |i| i as i64), - )) + let (start, end) = bytes_search_range(args, data.len()); + Ok(Object::Int(bytes_find_in(&data, &sub, start, end))) } fn bytes_rfind(args: &[Object]) -> Result { @@ -4846,9 +4919,16 @@ fn bytes_rfind(args: &[Object]) -> Result { args.get(1) .ok_or_else(|| type_error("rfind() expected 1 arg"))?, )?; + let (start, end) = bytes_search_range(args, data.len()); + if start > end || end > data.len() { + return Ok(Object::Int(-1)); + } + if sub.is_empty() { + return Ok(Object::Int(end as i64)); + } let mut last = -1i64; - if sub.len() <= data.len() { - for i in 0..=data.len() - sub.len() { + if sub.len() <= end - start { + for i in start..=end - sub.len() { if data[i..i + sub.len()] == sub[..] 
{ last = i as i64; } @@ -4870,12 +4950,13 @@ fn bytes_count(args: &[Object]) -> Result { args.get(1) .ok_or_else(|| type_error("count() expected 1 arg"))?, )?; + let (start, end) = bytes_search_range(args, data.len()); if sub.is_empty() { - return Ok(Object::Int(data.len() as i64 + 1)); + return Ok(Object::Int((end - start) as i64 + 1)); } let mut n = 0i64; - let mut i = 0; - while i + sub.len() <= data.len() { + let mut i = start; + while i + sub.len() <= end { if data[i..i + sub.len()] == sub[..] { n += 1; i += sub.len(); @@ -5072,6 +5153,64 @@ fn bytes_replace(args: &[Object]) -> Result { Ok(Object::new_bytes(out)) } +/// `bytes.translate(table, /, delete=b'')` and the `bytearray` +/// equivalent. `table` is `None` (identity) or a bytes-like of length +/// 256; bytes present in `delete` are dropped first. The receiver's +/// type (bytes vs bytearray) is preserved. +fn bytes_translate(args: &[Object]) -> Result { + let data = bytes_data(args)?; + let table = match args.get(1) { + None | Some(Object::None) => None, + Some(o) => { + let t = bytes_argview(o)?; + if t.len() != 256 { + return Err(value_error("translation table must be 256 characters long")); + } + Some(t) + } + }; + let delete = match args.get(2) { + None | Some(Object::None) => Vec::new(), + Some(o) => bytes_argview(o)?, + }; + let mut out = Vec::with_capacity(data.len()); + for &b in &data { + if delete.contains(&b) { + continue; + } + out.push(match &table { + Some(t) => t[b as usize], + None => b, + }); + } + if matches!(args.first(), Some(Object::ByteArray(_))) { + Ok(Object::new_bytearray(out)) + } else { + Ok(Object::new_bytes(out)) + } +} + +/// `bytes.maketrans(from, to)` — builds a 256-byte translation table +/// mapping each byte in `from` to the byte at the same index in `to`. 
+fn bytes_maketrans(args: &[Object]) -> Result { + let from = bytes_argview( + args.first() + .ok_or_else(|| type_error("maketrans() takes exactly two arguments"))?, + )?; + let to = bytes_argview( + args.get(1) + .ok_or_else(|| type_error("maketrans() takes exactly two arguments"))?, + )?; + if from.len() != to.len() { + return Err(value_error("maketrans arguments must have same length")); + } + let mut table: Vec = (0u8..=255).collect(); + for (f, t) in from.iter().zip(to.iter()) { + table[*f as usize] = *t; + } + Ok(Object::new_bytes(table)) +} + fn bytes_isalnum(args: &[Object]) -> Result { let data = bytes_data(args)?; Ok(Object::Bool( diff --git a/crates/weavepy-vm/src/lib.rs b/crates/weavepy-vm/src/lib.rs index 5a8285f..f23efa4 100644 --- a/crates/weavepy-vm/src/lib.rs +++ b/crates/weavepy-vm/src/lib.rs @@ -1427,14 +1427,15 @@ impl Interpreter { let i = frame.pop()?; let target = frame.pop()?; let value = frame.pop()?; + let g = frame.globals.clone(); if let Object::Instance(_) = &target { if let Some(method) = instance_method(&target, "__setitem__") { - self.call(&method, &[i.clone(), value], &[], &frame.globals.clone())?; + self.call(&method, &[i.clone(), value], &[], &g)?; } else { - self.store_subscr(&target, &i, value)?; + self.store_subscr(&target, &i, value, &g)?; } } else { - self.store_subscr(&target, &i, value)?; + self.store_subscr(&target, &i, value, &g)?; } } OpCode::DeleteSubscr => { @@ -1467,7 +1468,13 @@ impl Interpreter { OpCode::UnaryOp => { let v = frame.pop()?; let kind: UnaryKind = unsafe { std::mem::transmute(ins.arg as u8) }; - let r = unary_op(&v, kind)?; + let r = if matches!(kind, UnaryKind::Not) && matches!(v, Object::Instance(_)) { + // `not obj` must honour __bool__/__len__. + let g = frame.globals.clone(); + Object::Bool(!self.obj_truthy(&v, &g)?) + } else { + unary_op(&v, kind)? 
+ }; frame.push(r); } OpCode::CompareOp => { @@ -1582,13 +1589,27 @@ impl Interpreter { } OpCode::PopJumpIfFalse => { let v = frame.pop()?; - if !v.is_truthy() { + let truthy = match &v { + Object::Instance(_) => { + let g = frame.globals.clone(); + self.obj_truthy(&v, &g)? + } + _ => v.is_truthy(), + }; + if !truthy { frame.pc += ins.arg; } } OpCode::PopJumpIfTrue => { let v = frame.pop()?; - if v.is_truthy() { + let truthy = match &v { + Object::Instance(_) => { + let g = frame.globals.clone(); + self.obj_truthy(&v, &g)? + } + _ => v.is_truthy(), + }; + if truthy { frame.pc += ins.arg; } } @@ -3628,6 +3649,51 @@ impl Interpreter { Ok(Object::Int(v.len()? as i64)) } + /// VM-aware Python truthiness. For instances this dispatches + /// `__bool__` (then `__len__`) so user classes that define either + /// dunder are honoured in boolean contexts; everything else falls + /// back to the pure [`Object::is_truthy`]. Mirrors CPython's + /// `PyObject_IsTrue`. + fn obj_truthy( + &mut self, + v: &Object, + globals: &Rc>, + ) -> Result { + if let Object::Instance(_) = v { + if let Some(method) = instance_method(v, "__bool__") { + let r = self.call(&method, &[], &[], globals)?; + return match r { + Object::Bool(b) => Ok(b), + other => match other.as_i64() { + Some(i) => Ok(i != 0), + None => Err(type_error(format!( + "__bool__ should return bool, returned {}", + other.type_name() + ))), + }, + }; + } + if let Some(method) = instance_method(v, "__len__") { + let r = self.call(&method, &[], &[], globals)?; + return Ok(r.is_truthy()); + } + } + Ok(v.is_truthy()) + } + + /// `bool(x)` constructor — routes through [`Self::obj_truthy`] so a + /// custom `__bool__`/`__len__` is respected. + fn do_bool_call( + &mut self, + args: &[Object], + globals: &Rc>, + ) -> Result { + match args.first() { + None => Ok(Object::Bool(false)), + Some(v) => Ok(Object::Bool(self.obj_truthy(v, globals)?)), + } + } + /// `int(x)` with a fallback to the user-defined `__int__`. 
Matches /// CPython's coercion rules well enough for the common cases — /// user classes that store an integer payload (enums, ipaddress, @@ -3660,6 +3726,11 @@ impl Interpreter { ))), }; } + // `int` subclass instance with no `__int__` override: + // `int(x)` yields a plain int of the wrapped value. + if let Some(native) = other.native_value() { + return self.do_int_call(&[native], globals); + } Err(type_error(format!( "int() argument must be a string or a real number, not '{}'", other.type_name() @@ -3696,6 +3767,9 @@ impl Interpreter { ))), }; } + if let Some(native) = other.native_value() { + return self.do_float_call(&[native], globals); + } Err(type_error(format!( "float() argument must be a string or a real number, not '{}'", other.type_name() @@ -4181,57 +4255,6 @@ impl Interpreter { Ok(Object::new_list(items)) } - /// VM-routed dispatch for ``re.sub(pattern, repl_callable, text, - /// count=0, flags=0)`` where ``repl`` is a callable. We - /// collect the spans up-front (no VM reentrancy mid-iteration) - /// and then call ``repl(match)`` once per match. - fn do_re_sub_callable( - &mut self, - args: &[Object], - globals: &Rc>, - ) -> Result { - use crate::stdlib::re as remod; - let pat_obj = args - .first() - .ok_or_else(|| type_error("re.sub: missing pattern"))?; - let repl = args - .get(1) - .ok_or_else(|| type_error("re.sub: missing repl"))? 
- .clone(); - let text = match args.get(2) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("re.sub: expected str text")), - }; - let count = match args.get(3) { - Some(Object::Int(i)) => *i, - _ => 0, - }; - let (pat, default_flags) = remod::extract_pattern_pub(pat_obj)?; - let flags = match args.get(4) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - let matches = remod::collect_all_matches(&pat, flags, &text)?; - let mut out = String::new(); - let mut last_end = 0usize; - for (idx, (s, e, groups)) in matches.iter().enumerate() { - if count > 0 && (idx as i64) >= count { - break; - } - out.push_str(&text[last_end..*s]); - let m_obj = remod::build_match_object(&pat, &text, groups, *s, *e); - let ret = self.call_object(repl.clone(), &[m_obj], &[])?; - match ret { - Object::Str(rs) => out.push_str(&rs), - _ => return Err(type_error("re.sub callable must return str")), - } - last_end = *e; - } - out.push_str(&text[last_end..]); - let _ = globals; - Ok(Object::from_str(out)) - } - fn do_list_sort_call( &mut self, args: &[Object], @@ -4330,6 +4353,27 @@ impl Interpreter { if let Some(method) = instance_method(v, "__iter__") { return self.call(&method, &[], &[], globals); } + // Legacy sequence protocol: an object that defines + // `__getitem__` but no `__iter__` is still iterable — + // CPython calls `obj[0]`, `obj[1]`, … until `IndexError`. + // We materialise eagerly into a list (consistent with the + // `iter(callable, sentinel)` path above); the wrapped + // sequences this serves — `re`'s `SubPattern`, simple + // user containers — are finite and side-effect-free. 
+ if let Some(getitem) = instance_method(v, "__getitem__") { + let mut out: Vec = Vec::new(); + let mut i: i64 = 0; + loop { + match self.call(&getitem, &[Object::Int(i)], &[], globals) { + Ok(val) => out.push(val), + Err(e) if is_index_error(&e) => break, + Err(e) => return Err(e), + } + i += 1; + } + let list = Object::new_list(out); + return self.make_iter(&list, globals); + } Err(type_error(format!( "'{}' object is not iterable", v.type_name_owned() @@ -4997,6 +5041,33 @@ impl Interpreter { if let Some(method) = instance_method(b, rdunder) { return self.call(&method, std::slice::from_ref(a), &[], globals); } + // `str % args`: route through a VM-aware formatter so `%s` / `%r` + // of user instances dispatch `__str__` / `__repr__` (e.g. + // `"err: %s" % some_exception`). Other `%` operand types fall + // through to the pure `binary_op` path. + if matches!(op, BinOpKind::Mod) { + if let Object::Str(template) = a { + let template = template.clone(); + let mut resolve = + |obj: &Object, kind: char| -> Result, RuntimeError> { + if let Object::Instance(_) = obj { + let s = match kind { + 's' => self.stringify(obj, globals)?, + 'r' => self.repr_of(obj, globals)?, + _ => return Ok(None), + }; + Ok(Some(s)) + } else { + Ok(None) + } + }; + return Ok(Object::from_str(percent_format_with( + &template, + b, + &mut resolve, + )?)); + } + } binary_op(a, b, op) } @@ -6108,6 +6179,15 @@ impl Interpreter { container: &Object, index: &Object, ) -> Result { + // An `int` subclass instance used as an index (`xs[op]` where + // `op` is e.g. a `_NamedIntConstant`) acts as its int value. 
+ let unwrapped = match index { + Object::Instance(_) => index + .native_value() + .filter(|n| matches!(n, Object::Int(_) | Object::Long(_) | Object::Bool(_))), + _ => None, + }; + let index = unwrapped.as_ref().unwrap_or(index); match (container, index) { (Object::List(items), Object::Int(i)) => { let items = items.borrow(); @@ -6149,6 +6229,18 @@ impl Interpreter { let s: String = sliced.iter().map(|o| o.to_str()).collect(); Ok(Object::from_str(s)) } + (Object::Range(r), Object::Int(i)) => { + let len = container.len()? as i64; + let idx = if *i < 0 { *i + len } else { *i }; + if idx < 0 || idx >= len { + return Err(index_error("range object index out of range")); + } + Ok(Object::Int(r.start + idx * r.step)) + } + (Object::Range(r), Object::Slice(slc)) => { + let len = container.len()? as i64; + range_slice(r, len, slc) + } (Object::Bytes(buf), Object::Int(i)) => { let idx = normalize_index(*i, buf.len())?; Ok(Object::Int(i64::from(buf[idx]))) @@ -6223,10 +6315,11 @@ impl Interpreter { } fn store_subscr( - &self, + &mut self, container: &Object, index: &Object, value: Object, + globals: &Rc>, ) -> Result<(), RuntimeError> { match (container, index) { (Object::List(items), Object::Int(i)) => { @@ -6239,22 +6332,17 @@ impl Interpreter { // CPython: `xs[start:stop:step] = iterable`. We // collect the RHS, then splice in place. Supporting // strided slice assignment requires that `len(rhs)` - // matches the slice width. - let replacement = match value { + // matches the slice width. The RHS is collected via the + // full VM iteration protocol so objects that are only + // legacy-iterable (`__getitem__`, no `__iter__`) work. 
+ let replacement = match &value { Object::List(l) => l.borrow().clone(), Object::Tuple(t) => t.iter().cloned().collect::>(), - Object::Str(ref txt) => txt + Object::Str(txt) => txt .chars() .map(|c| Object::from_str(c.to_string())) .collect(), - other => { - let mut buf = Vec::new(); - let mut it = other.make_iter()?; - while let Some(v) = it.next_value() { - buf.push(v); - } - buf - } + _ => self.collect_iterable(&value, globals)?, }; let mut data = items.borrow_mut(); apply_slice_assignment(&mut data, s, replacement)?; @@ -6289,12 +6377,31 @@ impl Interpreter { items.remove(idx); Ok(()) } + (Object::List(items), Object::Slice(s)) => { + apply_slice_deletion(&mut items.borrow_mut(), s) + } (Object::Dict(d), key) => { if d.borrow_mut().shift_remove(&DictKey(key.clone())).is_none() { return Err(key_error(key.repr())); } Ok(()) } + (Object::ByteArray(b), Object::Int(i)) => { + let mut b = b.borrow_mut(); + let idx = normalize_index(*i, b.len())?; + b.remove(idx); + Ok(()) + } + (Object::ByteArray(b), Object::Slice(s)) => { + let mut b = b.borrow_mut(); + let mut indices = slice_indices(b.len(), s)?; + indices.sort_unstable(); + indices.dedup(); + for idx in indices.into_iter().rev() { + b.remove(idx); + } + Ok(()) + } _ => Err(type_error(format!( "'{}' object does not support item deletion", container.type_name() @@ -6344,6 +6451,9 @@ impl Interpreter { if b.name == "len" && args.len() == 1 { return self.do_len_call(&args[0], outer_globals); } + if b.name == "bool" && args.len() <= 1 { + return self.do_bool_call(args, outer_globals); + } if b.name == "int" && args.len() <= 2 { return self.do_int_call(args, outer_globals); } @@ -6488,20 +6598,6 @@ impl Interpreter { if (b.name == "min" || b.name == "max") && !args.is_empty() { return self.do_min_max_call(b.name, args, kwargs, outer_globals); } - // ``re.sub(pat, repl, text, count=0, flags=0)`` - // accepts a callable ``repl``; routing it through the - // VM lets the callback invoke arbitrary user code. 
- if b.name == "sub" && args.len() >= 3 { - let callable_repl = matches!( - args.get(1), - Some(Object::Function(_)) - | Some(Object::Builtin(_)) - | Some(Object::BoundMethod(_)) - ); - if callable_repl { - return self.do_re_sub_callable(args, outer_globals); - } - } // `format`'s dispatching: when args[0] is a string we // assume this is `"...".format(...)` (str_format // builtin) and pass kwargs through. Otherwise fall @@ -7173,6 +7269,11 @@ impl Interpreter { let global_dummy = Rc::new(RefCell::new(DictData::new())); return self.do_float_call(args, &global_dummy); } + // `bool(x)` must consult __bool__/__len__ for instances. + if cls.name == "bool" && args.len() <= 1 && kwargs.is_empty() { + let global_dummy = Rc::new(RefCell::new(DictData::new())); + return self.do_bool_call(args, &global_dummy); + } if let Some(builtin) = self.builtin_constructor_for(&cls) { if !kwargs.is_empty() { return Err(type_error(format!( @@ -7548,7 +7649,24 @@ impl Interpreter { IC::CallPyExactNoFree { func_id, argc: ca } => { if ca as usize == argc { if let Object::Function(f) = &callable { - if specialize::rc_id(f) == func_id && args.len() == argc { + // `func_id` is a raw pointer fingerprint and can + // alias a *different* function after the original + // was freed and its allocation reused (ABA). Re- + // verify the shape this fast path assumes — exact + // arity, no cells/closure — so a recycled address + // can never run an incompatible function through + // the no-free path (which skips defaults & cells). 
+ let code = &f.code; + if specialize::rc_id(f) == func_id + && args.len() == argc + && code.arg_count as usize == argc + && !code.has_varargs + && !code.has_varkeywords + && code.kwonly_count == 0 + && code.cellvars.is_empty() + && code.freevars.is_empty() + && f.closure.is_empty() + { specialize::record_hit(op_idx); let f = f.clone(); let r = self.run_py_exact_nofree(&f, args)?; @@ -7562,7 +7680,17 @@ impl Interpreter { IC::CallPyExact { func_id, argc: ca } => { if ca as usize == argc { if let Object::Function(f) = &callable { - if specialize::rc_id(f) == func_id && args.len() == argc { + // Same ABA guard as above: confirm exact arity + // before taking the binding-free path (cells are + // rebuilt from `f.code`, so they stay correct). + let code = &f.code; + if specialize::rc_id(f) == func_id + && args.len() == argc + && code.arg_count as usize == argc + && !code.has_varargs + && !code.has_varkeywords + && code.kwonly_count == 0 + { specialize::record_hit(op_idx); let f = f.clone(); let r = self.run_py_exact_with_cells(&f, args)?; @@ -8416,6 +8544,126 @@ fn apply_slice_assignment( Ok(()) } +/// Compute the concrete indices covered by `s` over a sequence of +/// length `len` (CPython's `PySlice_Unpack` + `PySlice_AdjustIndices`), +/// returned in iteration order. +fn slice_indices(len: usize, s: &PySlice) -> Result, RuntimeError> { + let len = len as i64; + let step = match &s.step { + Object::None => 1i64, + Object::Int(i) => *i, + _ => return Err(type_error("slice indices must be integers or None")), + }; + if step == 0 { + return Err(value_error("slice step cannot be zero")); + } + let (lower, upper) = if step < 0 { + (-1i64, len - 1) + } else { + (0i64, len) + }; + // Resolve a bound: `None` falls back to its default sentinel + // directly (never re-mapped through the negative-index rule), while + // explicit values are wrapped (`+= len`) then clamped to [lower, upper]. 
+ let resolve = |o: &Object, default: i64| -> Result { + match o { + Object::None => Ok(default), + Object::Int(i) => { + let v = if *i < 0 { *i + len } else { *i }; + Ok(v.clamp(lower, upper)) + } + _ => Err(type_error("slice indices must be integers or None")), + } + }; + let mut i = resolve(&s.start, if step > 0 { 0 } else { len - 1 })?; + let stop = resolve(&s.stop, if step > 0 { len } else { -1 })?; + let mut out = Vec::new(); + while (step > 0 && i < stop) || (step < 0 && i > stop) { + if i >= 0 && (i as usize) < len as usize { + out.push(i as usize); + } + i += step; + } + Ok(out) +} + +/// CPython's `PySlice_Unpack` + `PySlice_AdjustIndices`: resolve a slice +/// against a sequence of length `len`, returning +/// `(start, stop, step, slicelength)` with the same clamping rules. +fn adjust_slice(len: i64, s: &PySlice) -> Result<(i64, i64, i64, i64), RuntimeError> { + let step = match &s.step { + Object::None => 1i64, + Object::Int(i) => *i, + _ => return Err(type_error("slice indices must be integers or None")), + }; + if step == 0 { + return Err(value_error("slice step cannot be zero")); + } + let (lower, upper) = if step < 0 { + (-1i64, len - 1) + } else { + (0i64, len) + }; + let clamp = |o: &Object, dflt: i64| -> Result { + match o { + Object::None => Ok(dflt), + Object::Int(i) => { + let mut x = *i; + if x < 0 { + x += len; + if x < lower { + x = lower; + } + } else if x > upper { + x = upper; + } + Ok(x) + } + _ => Err(type_error("slice indices must be integers or None")), + } + }; + let start = clamp(&s.start, if step < 0 { upper } else { lower })?; + let stop = clamp(&s.stop, if step < 0 { lower } else { upper })?; + let slicelength = if step < 0 { + if stop < start { + (start - stop - 1) / (-step) + 1 + } else { + 0 + } + } else if start < stop { + (stop - start - 1) / step + 1 + } else { + 0 + }; + Ok((start, stop, step, slicelength.max(0))) +} + +/// `range(...)[slice]` → a new range, mirroring CPython `compute_slice`. 
+fn range_slice(r: &crate::object::Range, len: i64, s: &PySlice) -> Result { + let (start, _stop, step, slicelen) = adjust_slice(len, s)?; + let new_start = r.start + start * r.step; + let new_step = r.step * step; + let new_stop = new_start + slicelen * new_step; + Ok(Object::Range(Rc::new(crate::object::Range { + start: new_start, + stop: new_stop, + step: new_step, + }))) +} + +/// `del data[start:stop:step]` — remove the slice members in place. +fn apply_slice_deletion(data: &mut Vec, s: &PySlice) -> Result<(), RuntimeError> { + let mut indices = slice_indices(data.len(), s)?; + // Remove from highest index to lowest so earlier removals don't + // shift the positions still to be deleted. + indices.sort_unstable(); + indices.dedup(); + for idx in indices.into_iter().rev() { + data.remove(idx); + } + Ok(()) +} + fn slice_seq(seq: &[Object], s: &PySlice) -> Result, RuntimeError> { let len = seq.len() as i64; let step = match &s.step { @@ -8430,43 +8678,76 @@ fn slice_seq(seq: &[Object], s: &PySlice) -> Result, RuntimeError> { if step == 0 { return Err(value_error("slice step cannot be zero")); } - let extract = |o: &Object, default: i64| -> Result { - match o { - Object::None => Ok(default), - Object::Int(i) => Ok(*i), - _ => Err(type_error( - "slice indices must be integers or None or have an __index__ method", - )), - } - }; - let start = extract(&s.start, if step > 0 { 0 } else { len - 1 })?; - let stop = extract(&s.stop, if step > 0 { len } else { -1 })?; - let norm = |x: i64| -> i64 { + // Map an *explicit* index to a concrete one, mirroring CPython's + // `PySlice_AdjustIndices`. The clamp floor differs by step sign: a + // negative step can legitimately walk down to index -1 (one below + // the start of the sequence), whereas a positive step floors at 0. 
+ let adjust = |x: i64| -> i64 { if x < 0 { - let n = x + len; - if n < 0 && step > 0 { - 0 + let v = x + len; + if v < 0 { + if step < 0 { + -1 + } else { + 0 + } + } else { + v + } + } else if x >= len { + if step < 0 { + len - 1 } else { - n + len } - } else if x > len { - len } else { x } }; - let mut i = norm(start); - let stop_norm = norm(stop); + // Defaults for an omitted bound use sentinels that must *not* pass + // through `adjust` (e.g. an omitted `stop` with a negative step is + // -1, meaning "below index 0", not "the last element"). + let start = match &s.start { + Object::None => { + if step < 0 { + len - 1 + } else { + 0 + } + } + Object::Int(i) => adjust(*i), + _ => { + return Err(type_error( + "slice indices must be integers or None or have an __index__ method", + )) + } + }; + let stop = match &s.stop { + Object::None => { + if step < 0 { + -1 + } else { + len + } + } + Object::Int(i) => adjust(*i), + _ => { + return Err(type_error( + "slice indices must be integers or None or have an __index__ method", + )) + } + }; + let mut i = start; let mut out = Vec::new(); if step > 0 { - while i < stop_norm { + while i < stop { if (0..len).contains(&i) { out.push(seq[i as usize].clone()); } i += step; } } else { - while i > stop_norm { + while i > stop { if (0..len).contains(&i) { out.push(seq[i as usize].clone()); } @@ -9098,6 +9379,20 @@ fn bytes_percent_args(value: &Object) -> Object { } pub(crate) fn percent_format(template: &str, value: &Object) -> Result { + let mut noop = |_: &Object, _: char| Ok(None); + percent_format_with(template, value, &mut noop) +} + +/// Printf-style `%` formatting with a VM-supplied `resolve` callback. +/// +/// `resolve(item, kind)` lets the caller render `%s` / `%r` of user +/// instances through `__str__` / `__repr__` (returning `Some(rendered)`), +/// falling back to the built-in conversion when it returns `None`. 
+pub(crate) fn percent_format_with( + template: &str, + value: &Object, + resolve: &mut dyn FnMut(&Object, char) -> Result, RuntimeError>, +) -> Result { let mut out = String::new(); let bytes = template.as_bytes(); let mut i = 0; @@ -9219,13 +9514,36 @@ pub(crate) fn percent_format(template: &str, value: &Object) -> Result format_via_spec(&Object::from_str(item.to_str()), &spec)?, - 'r' => format_via_spec(&Object::from_str(item.repr()), &spec.replace('r', "s"))?, + 's' => { + let s = match resolve(&item, 's')? { + Some(s) => s, + None => item.to_str(), + }; + format_via_spec(&Object::from_str(s), &spec)? + } + 'r' => { + let s = match resolve(&item, 'r')? { + Some(s) => s, + None => item.repr(), + }; + format_via_spec(&Object::from_str(s), &spec.replace('r', "s"))? + } 'a' => format_via_spec( &Object::from_str(ascii_repr(&item)), &spec.replace('a', "s"), )?, - 'd' | 'i' | 'u' => format_via_spec(&item, &spec.replace(['i', 'u'], "d"))?, + 'd' | 'i' | 'u' => { + // Unwrap `int` subclasses (enum members, _NamedIntConstant) + // so `%d` sees a real integer rather than the instance. + let numeric = match &item { + Object::Instance(_) => match item.as_i64() { + Some(n) => Object::Int(n), + None => item.clone(), + }, + _ => item.clone(), + }; + format_via_spec(&numeric, &spec.replace(['i', 'u'], "d"))? + } 'b' | 'o' | 'x' | 'X' => format_via_spec(&item, &spec)?, 'f' | 'F' | 'e' | 'E' | 'g' | 'G' => format_via_spec(&item, &spec)?, 'c' => match &item { @@ -9699,6 +10017,19 @@ fn group_decimal(mag: u64, sep: char) -> String { out } +/// Is `e` an `IndexError` (or subclass)? Used by the legacy +/// `__getitem__` iteration protocol to detect the end of a sequence. 
+fn is_index_error(e: &RuntimeError) -> bool { + if let RuntimeError::PyException(pe) = e { + if let Object::Instance(inst) = &pe.instance { + return inst + .class + .is_subclass_of(&crate::builtin_types::builtin_types().index_error); + } + } + false +} + fn binop_dunders(op: BinOpKind) -> (&'static str, &'static str) { use BinOpKind as B; match op { @@ -9898,8 +10229,15 @@ fn object_to_constant(o: &Object) -> Constant { fn binary_op(a: &Object, b: &Object, op: BinOpKind) -> Result { use BinOpKind as B; use Object as O; + // Subclasses of immutable built-ins (`class C(int)`, `enum.IntEnum`, + // `_NamedIntConstant`, …) behave like the value they wrap. By the + // time we reach this primitive path the caller has already ruled + // out any user `__op__` / `__rop__` override, so unwrapping to the + // native value is the correct (and CPython-matching) fallback. + let a = a.native_value().unwrap_or_else(|| a.clone()); + let b = b.native_value().unwrap_or_else(|| b.clone()); // Promote bool → int where appropriate. - let (a, b) = (promote_bool(a), promote_bool(b)); + let (a, b) = (promote_bool(&a), promote_bool(&b)); // Numeric tower: any (int-like, int-like) arithmetic routes // through the bignum-aware path with i64 fast-track and overflow diff --git a/crates/weavepy-vm/src/object.rs b/crates/weavepy-vm/src/object.rs index 2f5720c..c6cfe23 100644 --- a/crates/weavepy-vm/src/object.rs +++ b/crates/weavepy-vm/src/object.rs @@ -529,6 +529,12 @@ impl Eq for DictKey {} impl Hash for DictKey { fn hash(&self, state: &mut H) { + // An `int`/`str`/… subclass instance hashes identically to the + // value it wraps, so it can be used interchangeably with that + // value as a dict/set key (CPython invariant). 
+ if let Some(native) = self.0.native_value() { + return DictKey(native).hash(state); + } match &self.0 { Object::None => 0u8.hash(state), Object::Bool(b) => { @@ -1207,6 +1213,14 @@ impl Object { Object::FrozenSet(s) => !s.is_empty(), Object::Cell(inner) => inner.borrow().is_truthy(), Object::Instance(inst) => { + // int/str/… subclass instances are truthy per their + // wrapped value unless the class overrides __bool__/__len__. + if inst.class.lookup("__bool__").is_none() && inst.class.lookup("__len__").is_none() + { + if let Some(native) = &inst.native { + return native.is_truthy(); + } + } // Honour __bool__ then __len__ before defaulting to True. if let Some(m) = inst.class.lookup("__bool__") { // Caller dispatches; we cannot run Python here. @@ -1282,6 +1296,17 @@ impl Object { /// `==` operator semantics — recursive value equality. pub fn eq_value(&self, other: &Self) -> bool { + // Subclasses of immutable built-ins (`class C(int)`, + // `enum.IntEnum`, `_NamedIntConstant`, …) compare by the value + // they wrap, so `C(5) == 5` and two distinct instances with the + // same value are equal — exactly like CPython. + let lhs_native = self.native_value(); + let rhs_native = other.native_value(); + if lhs_native.is_some() || rhs_native.is_some() { + let l = lhs_native.as_ref().unwrap_or(self); + let r = rhs_native.as_ref().unwrap_or(other); + return l.eq_value(r); + } match (self, other) { (Object::None, Object::None) => true, (Object::Bool(a), Object::Bool(b)) => a == b, @@ -1366,6 +1391,14 @@ impl Object { /// combinations return [`Err`] mapping to Python's `TypeError`. pub fn cmp(&self, other: &Self) -> Result { use Object as O; + // Order `int`/`str`/… subclass instances by the value they wrap. 
+ let lhs_native = self.native_value(); + let rhs_native = other.native_value(); + if lhs_native.is_some() || rhs_native.is_some() { + let l = lhs_native.as_ref().unwrap_or(self); + let r = rhs_native.as_ref().unwrap_or(other); + return l.cmp(r); + } match (self, other) { (O::Int(a), O::Int(b)) => Ok(a.cmp(b)), (O::Long(a), O::Long(b)) => Ok((**a).cmp(b)), @@ -1677,19 +1710,42 @@ impl Object { } } Object::Str(s) => { + // CPython quote selection (Objects/unicodeobject.c + // `unicode_repr`): use '\'' unless the string contains a + // single quote and no double quote, in which case use '"' + // so the single quotes need not be escaped. + let has_single = s.contains('\''); + let has_double = s.contains('"'); + let quote = if has_single && !has_double { '"' } else { '\'' }; let mut out = String::with_capacity(s.len() + 2); - out.push('\''); + out.push(quote); for c in s.chars() { match c { '\\' => out.push_str("\\\\"), - '\'' => out.push_str("\\'"), '\n' => out.push_str("\\n"), '\r' => out.push_str("\\r"), '\t' => out.push_str("\\t"), - c => out.push(c), + c if c == quote => { + out.push('\\'); + out.push(quote); + } + c if char_is_printable(c) => out.push(c), + // Non-printable code points are escaped the way + // CPython's `unicode_repr` does: \xNN, \uNNNN or + // \UNNNNNNNN depending on the code-point width. + c => { + let n = c as u32; + if n <= 0xff { + out.push_str(&format!("\\x{n:02x}")); + } else if n <= 0xffff { + out.push_str(&format!("\\u{n:04x}")); + } else { + out.push_str(&format!("\\U{n:08x}")); + } + } } } - out.push('\''); + out.push(quote); out } Object::Tuple(items) => { @@ -2021,6 +2077,28 @@ fn bytes_contains(haystack: &[u8], needle: &[u8]) -> bool { haystack.windows(needle.len()).any(|w| w == needle) } +/// CPython's `Py_UNICODE_ISPRINTABLE`: every character is printable +/// except those in the "Other" (Cc, Cf, Cs, Co, Cn) and "Separator" +/// (Zl, Zp, Zs) general categories, with U+0020 (space) treated as +/// printable. 
Used by `repr(str)` (and `str.isprintable`). +pub(crate) fn char_is_printable(c: char) -> bool { + if c == ' ' { + return true; + } + use unicode_properties::{GeneralCategory as GC, UnicodeGeneralCategory}; + !matches!( + c.general_category(), + GC::Control + | GC::Format + | GC::Surrogate + | GC::PrivateUse + | GC::Unassigned + | GC::LineSeparator + | GC::ParagraphSeparator + | GC::SpaceSeparator + ) +} + fn bytes_repr(b: &[u8]) -> String { let mut out = String::with_capacity(b.len() + 3); out.push('b'); @@ -2174,11 +2252,25 @@ impl Object { /// View this object as `i64`, succeeding only when the value /// genuinely fits in 64 bits. Returns `None` for `Long`s that /// don't fit, and for non-integer types. + /// For an instance of a subclass of a primitive built-in + /// (`int`, `str`, …) return a clone of the underlying value the + /// instance wraps; `None` for everything else. The wrapped value + /// is always itself a primitive (never another `Instance`), so + /// callers can recurse exactly once. 
+ #[inline] + pub fn native_value(&self) -> Option { + match self { + Object::Instance(inst) => inst.native.clone(), + _ => None, + } + } + pub fn as_i64(&self) -> Option { match self { Object::Bool(b) => Some(i64::from(*b)), Object::Int(i) => Some(*i), Object::Long(b) => b.to_i64(), + Object::Instance(inst) => inst.native.as_ref().and_then(Object::as_i64), _ => None, } } @@ -2190,6 +2282,7 @@ impl Object { Object::Bool(b) => Some(usize::from(*b)), Object::Int(i) if *i >= 0 => usize::try_from(*i).ok(), Object::Long(b) if !b.is_negative() => b.to_usize(), + Object::Instance(inst) => inst.native.as_ref().and_then(Object::as_usize), _ => None, } } diff --git a/crates/weavepy-vm/src/stdlib/mod.rs b/crates/weavepy-vm/src/stdlib/mod.rs index 3ba6b93..dec2f2f 100644 --- a/crates/weavepy-vm/src/stdlib/mod.rs +++ b/crates/weavepy-vm/src/stdlib/mod.rs @@ -39,7 +39,6 @@ pub mod marshal_mod; pub mod math; pub mod os; pub mod random; -pub mod re; pub mod resource_mod; pub mod secrets_mod; pub mod select_mod; @@ -47,6 +46,7 @@ pub mod shutil_mod; pub mod signal_mod; pub mod socket_mod; pub mod sqlite3_mod; +pub mod sre_mod; pub mod ssl_mod; pub mod struct_mod; pub mod subprocess_mod; @@ -89,7 +89,6 @@ pub fn register_all(cache: &ModuleCache) { cache.register_builtin("os", os::build); cache.register_builtin("os.path", os::build_path); cache.register_builtin("io", io::build); - cache.register_builtin("re", re::build); cache.register_builtin("json", json::build); cache.register_builtin("random", random::build); cache.register_builtin("time", time::build); @@ -114,6 +113,8 @@ pub fn register_all(cache: &ModuleCache) { cache.register_builtin("_struct", struct_mod::build); cache.register_builtin("_codecs", codecs_mod::build); cache.register_builtin("marshal", marshal_mod::build); + // RFC 0035 — native SRE regex core behind the frozen `re` package. + cache.register_builtin("_sre", sre_mod::build); // RFC 0033 — native AST parsing core behind the frozen `ast` module. 
cache.register_builtin("_ast", ast_mod::build); // RFC 0033 — native symbol-table core behind the frozen `symtable` module. @@ -992,5 +993,56 @@ fn frozen_sources() -> &'static [FrozenSource] { source: include_str!("python/symtable.py"), is_package: false, }, + // RFC 0035 — the `re` package: a faithful port of CPython's + // secret-labs engine. `_constants` / `_parser` / `_compiler` / + // `_casefix` are verbatim from CPython 3.13; `_engine` builds the + // Pattern / Match objects on top of the native `_sre` core. + FrozenSource { + name: "re", + source: include_str!("python/re_init.py"), + is_package: true, + }, + FrozenSource { + name: "re._constants", + source: include_str!("python/re_constants.py"), + is_package: false, + }, + FrozenSource { + name: "re._casefix", + source: include_str!("python/re_casefix.py"), + is_package: false, + }, + FrozenSource { + name: "re._parser", + source: include_str!("python/re_parser.py"), + is_package: false, + }, + FrozenSource { + name: "re._compiler", + source: include_str!("python/re_compiler.py"), + is_package: false, + }, + FrozenSource { + name: "re._engine", + source: include_str!("python/re_engine.py"), + is_package: false, + }, + // Deprecated 3.x aliases kept for compatibility with code that + // still imports the pre-3.11 module names. 
+ FrozenSource { + name: "sre_constants", + source: include_str!("python/sre_constants.py"), + is_package: false, + }, + FrozenSource { + name: "sre_parse", + source: include_str!("python/sre_parse.py"), + is_package: false, + }, + FrozenSource { + name: "sre_compile", + source: include_str!("python/sre_compile.py"), + is_package: false, + }, ] } diff --git a/crates/weavepy-vm/src/stdlib/python/enum.py b/crates/weavepy-vm/src/stdlib/python/enum.py index 032a84d..f0d8520 100644 --- a/crates/weavepy-vm/src/stdlib/python/enum.py +++ b/crates/weavepy-vm/src/stdlib/python/enum.py @@ -166,7 +166,14 @@ def __members__(cls): return dict(cls._member_map_) if cls._member_map_ is not None else {} def _create_member_(cls, name, value): - member = object.__new__(cls) + # For int-backed enums (IntEnum/IntFlag) build a real int + # instance so members *are* ints — `IntEnum.X + 1`, `flags & + # member`, `int(member)` and dict/set interchange with the + # bare value all work exactly as in CPython. + if isinstance(value, int) and issubclass(cls, int): + member = int.__new__(cls, value) + else: + member = object.__new__(cls) member._name_ = name member._value_ = value return member @@ -217,12 +224,10 @@ def __hash__(self): return hash(self._name_) -class IntEnum(Enum): - """Mirror of :class:`Enum` whose members compare equal to their - integer value. (CPython inherits from ``int`` directly; WeavePy - keeps a separate base and overloads ``__eq__`` / ``__int__`` to - cover the common patterns.) 
- """ +class IntEnum(int, Enum): + """Mirror of :class:`Enum` whose members are also genuine ints, so + they compare and operate exactly like their integer value + (CPython's ``class IntEnum(int, Enum)``).""" def __int__(self): return self._value_ @@ -308,7 +313,10 @@ def _decompose_flag(cls, value): combined_value |= member._value_ if combined_value != value: raise ValueError(f"{value!r} is not a valid {cls.__name__}") - new_member = object.__new__(cls) + if issubclass(cls, int): + new_member = int.__new__(cls, value) + else: + new_member = object.__new__(cls) new_member._name_ = "|".join(combined_name) new_member._value_ = value return new_member @@ -345,8 +353,9 @@ def __bool__(self): return bool(self._value_) -class IntFlag(Flag): - """Like :class:`IntEnum` but for bitfield-style values.""" +class IntFlag(int, Flag): + """Like :class:`IntEnum` but for bitfield-style values; members are + genuine ints (CPython's ``class IntFlag(int, Flag)``).""" def __int__(self): return self._value_ diff --git a/crates/weavepy-vm/src/stdlib/python/re_casefix.py b/crates/weavepy-vm/src/stdlib/python/re_casefix.py new file mode 100644 index 0000000..fed2d84 --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/re_casefix.py @@ -0,0 +1,106 @@ +# Auto-generated by Tools/build/generate_re_casefix.py. + +# Maps the code of lowercased character to codes of different lowercased +# characters which have the same uppercase. 
+_EXTRA_CASES = { + # LATIN SMALL LETTER I: LATIN SMALL LETTER DOTLESS I + 0x0069: (0x0131,), # 'i': 'ı' + # LATIN SMALL LETTER S: LATIN SMALL LETTER LONG S + 0x0073: (0x017f,), # 's': 'ſ' + # MICRO SIGN: GREEK SMALL LETTER MU + 0x00b5: (0x03bc,), # 'µ': 'μ' + # LATIN SMALL LETTER DOTLESS I: LATIN SMALL LETTER I + 0x0131: (0x0069,), # 'ı': 'i' + # LATIN SMALL LETTER LONG S: LATIN SMALL LETTER S + 0x017f: (0x0073,), # 'ſ': 's' + # COMBINING GREEK YPOGEGRAMMENI: GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + 0x0345: (0x03b9, 0x1fbe), # '\u0345': 'ιι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + 0x0390: (0x1fd3,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + 0x03b0: (0x1fe3,), # 'ΰ': 'ΰ' + # GREEK SMALL LETTER BETA: GREEK BETA SYMBOL + 0x03b2: (0x03d0,), # 'β': 'ϐ' + # GREEK SMALL LETTER EPSILON: GREEK LUNATE EPSILON SYMBOL + 0x03b5: (0x03f5,), # 'ε': 'ϵ' + # GREEK SMALL LETTER THETA: GREEK THETA SYMBOL + 0x03b8: (0x03d1,), # 'θ': 'ϑ' + # GREEK SMALL LETTER IOTA: COMBINING GREEK YPOGEGRAMMENI, GREEK PROSGEGRAMMENI + 0x03b9: (0x0345, 0x1fbe), # 'ι': '\u0345ι' + # GREEK SMALL LETTER KAPPA: GREEK KAPPA SYMBOL + 0x03ba: (0x03f0,), # 'κ': 'ϰ' + # GREEK SMALL LETTER MU: MICRO SIGN + 0x03bc: (0x00b5,), # 'μ': 'µ' + # GREEK SMALL LETTER PI: GREEK PI SYMBOL + 0x03c0: (0x03d6,), # 'π': 'ϖ' + # GREEK SMALL LETTER RHO: GREEK RHO SYMBOL + 0x03c1: (0x03f1,), # 'ρ': 'ϱ' + # GREEK SMALL LETTER FINAL SIGMA: GREEK SMALL LETTER SIGMA + 0x03c2: (0x03c3,), # 'ς': 'σ' + # GREEK SMALL LETTER SIGMA: GREEK SMALL LETTER FINAL SIGMA + 0x03c3: (0x03c2,), # 'σ': 'ς' + # GREEK SMALL LETTER PHI: GREEK PHI SYMBOL + 0x03c6: (0x03d5,), # 'φ': 'ϕ' + # GREEK BETA SYMBOL: GREEK SMALL LETTER BETA + 0x03d0: (0x03b2,), # 'ϐ': 'β' + # GREEK THETA SYMBOL: GREEK SMALL LETTER THETA + 0x03d1: (0x03b8,), # 'ϑ': 'θ' + # GREEK PHI SYMBOL: GREEK SMALL LETTER PHI + 0x03d5: (0x03c6,), 
# 'ϕ': 'φ' + # GREEK PI SYMBOL: GREEK SMALL LETTER PI + 0x03d6: (0x03c0,), # 'ϖ': 'π' + # GREEK KAPPA SYMBOL: GREEK SMALL LETTER KAPPA + 0x03f0: (0x03ba,), # 'ϰ': 'κ' + # GREEK RHO SYMBOL: GREEK SMALL LETTER RHO + 0x03f1: (0x03c1,), # 'ϱ': 'ρ' + # GREEK LUNATE EPSILON SYMBOL: GREEK SMALL LETTER EPSILON + 0x03f5: (0x03b5,), # 'ϵ': 'ε' + # CYRILLIC SMALL LETTER VE: CYRILLIC SMALL LETTER ROUNDED VE + 0x0432: (0x1c80,), # 'в': 'ᲀ' + # CYRILLIC SMALL LETTER DE: CYRILLIC SMALL LETTER LONG-LEGGED DE + 0x0434: (0x1c81,), # 'д': 'ᲁ' + # CYRILLIC SMALL LETTER O: CYRILLIC SMALL LETTER NARROW O + 0x043e: (0x1c82,), # 'о': 'ᲂ' + # CYRILLIC SMALL LETTER ES: CYRILLIC SMALL LETTER WIDE ES + 0x0441: (0x1c83,), # 'с': 'ᲃ' + # CYRILLIC SMALL LETTER TE: CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x0442: (0x1c84, 0x1c85), # 'т': 'ᲄᲅ' + # CYRILLIC SMALL LETTER HARD SIGN: CYRILLIC SMALL LETTER TALL HARD SIGN + 0x044a: (0x1c86,), # 'ъ': 'ᲆ' + # CYRILLIC SMALL LETTER YAT: CYRILLIC SMALL LETTER TALL YAT + 0x0463: (0x1c87,), # 'ѣ': 'ᲇ' + # CYRILLIC SMALL LETTER ROUNDED VE: CYRILLIC SMALL LETTER VE + 0x1c80: (0x0432,), # 'ᲀ': 'в' + # CYRILLIC SMALL LETTER LONG-LEGGED DE: CYRILLIC SMALL LETTER DE + 0x1c81: (0x0434,), # 'ᲁ': 'д' + # CYRILLIC SMALL LETTER NARROW O: CYRILLIC SMALL LETTER O + 0x1c82: (0x043e,), # 'ᲂ': 'о' + # CYRILLIC SMALL LETTER WIDE ES: CYRILLIC SMALL LETTER ES + 0x1c83: (0x0441,), # 'ᲃ': 'с' + # CYRILLIC SMALL LETTER TALL TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x1c84: (0x0442, 0x1c85), # 'ᲄ': 'тᲅ' + # CYRILLIC SMALL LETTER THREE-LEGGED TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE + 0x1c85: (0x0442, 0x1c84), # 'ᲅ': 'тᲄ' + # CYRILLIC SMALL LETTER TALL HARD SIGN: CYRILLIC SMALL LETTER HARD SIGN + 0x1c86: (0x044a,), # 'ᲆ': 'ъ' + # CYRILLIC SMALL LETTER TALL YAT: CYRILLIC SMALL LETTER YAT + 0x1c87: (0x0463,), # 'ᲇ': 'ѣ' + # CYRILLIC SMALL LETTER UNBLENDED UK: CYRILLIC SMALL LETTER MONOGRAPH UK + 0x1c88: 
(0xa64b,), # 'ᲈ': 'ꙋ' + # LATIN SMALL LETTER S WITH DOT ABOVE: LATIN SMALL LETTER LONG S WITH DOT ABOVE + 0x1e61: (0x1e9b,), # 'ṡ': 'ẛ' + # LATIN SMALL LETTER LONG S WITH DOT ABOVE: LATIN SMALL LETTER S WITH DOT ABOVE + 0x1e9b: (0x1e61,), # 'ẛ': 'ṡ' + # GREEK PROSGEGRAMMENI: COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA + 0x1fbe: (0x0345, 0x03b9), # 'ι': '\u0345ι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + 0x1fd3: (0x0390,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + 0x1fe3: (0x03b0,), # 'ΰ': 'ΰ' + # CYRILLIC SMALL LETTER MONOGRAPH UK: CYRILLIC SMALL LETTER UNBLENDED UK + 0xa64b: (0x1c88,), # 'ꙋ': 'ᲈ' + # LATIN SMALL LIGATURE LONG S T: LATIN SMALL LIGATURE ST + 0xfb05: (0xfb06,), # 'ſt': 'st' + # LATIN SMALL LIGATURE ST: LATIN SMALL LIGATURE LONG S T + 0xfb06: (0xfb05,), # 'st': 'ſt' +} diff --git a/crates/weavepy-vm/src/stdlib/python/re_compiler.py b/crates/weavepy-vm/src/stdlib/python/re_compiler.py new file mode 100644 index 0000000..c26e999 --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/re_compiler.py @@ -0,0 +1,775 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert template to internal format +# +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +import _sre +from . 
import _parser +from ._constants import * +from ._casefix import _EXTRA_CASES + +assert _sre.MAGIC == MAGIC, "SRE module mismatch" + +_LITERAL_CODES = {LITERAL, NOT_LITERAL} +_SUCCESS_CODES = {SUCCESS, FAILURE} +_ASSERT_CODES = {ASSERT, ASSERT_NOT} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN} + +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + +def _combine_flags(flags, add_flags, del_flags, + TYPE_FLAGS=_parser.TYPE_FLAGS): + if add_flags & TYPE_FLAGS: + flags &= ~TYPE_FLAGS + return (flags | add_flags) & ~del_flags + +def _compile(code, pattern, flags): + # internal: compile a (sub)pattern + emit = code.append + _len = len + LITERAL_CODES = _LITERAL_CODES + REPEATING_CODES = _REPEATING_CODES + SUCCESS_CODES = _SUCCESS_CODES + ASSERT_CODES = _ASSERT_CODES + iscased = None + tolower = None + fixes = None + if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: + if flags & SRE_FLAG_UNICODE: + iscased = _sre.unicode_iscased + tolower = _sre.unicode_tolower + fixes = _EXTRA_CASES + else: + iscased = _sre.ascii_iscased + tolower = _sre.ascii_tolower + for op, av in pattern: + if op in LITERAL_CODES: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + emit(av) + elif flags & SRE_FLAG_LOCALE: + emit(OP_LOCALE_IGNORE[op]) + emit(av) + elif not iscased(av): + emit(op) + emit(av) + else: + lo = tolower(av) + if not fixes: # ascii + emit(OP_IGNORE[op]) + emit(lo) + elif lo not in fixes: + emit(OP_UNICODE_IGNORE[op]) + emit(lo) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + if op is NOT_LITERAL: + emit(NEGATE) + for k in (lo,) + fixes[lo]: + emit(LITERAL) + emit(k) + emit(FAILURE) + code[skip] = _len(code) - skip + elif op is IN: + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif not hascased: + 
emit(IN) + elif not fixes: # ascii + emit(IN_IGNORE) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + _compile_charset(charset, flags, code) + code[skip] = _len(code) - skip + elif op is ANY: + if flags & SRE_FLAG_DOTALL: + emit(ANY_ALL) + else: + emit(ANY) + elif op in REPEATING_CODES: + if _simple(av[2]): + emit(REPEATING_CODES[op][2]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + else: + emit(REPEATING_CODES[op][0]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + emit(REPEATING_CODES[op][1]) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group: + emit(MARK) + emit((group-1)*2) + # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) + if group: + emit(MARK) + emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op in SUCCESS_CODES: + emit(op) + elif op in ASSERT_CODES: + emit(op) + skip = _len(code); emit(0) + if av[0] >= 0: + emit(0) # look ahead + else: + lo, hi = av[1].getwidth() + if lo > MAXCODE: + raise error("looks too much behind") + if lo != hi: + raise PatternError("look-behind requires fixed-width pattern") + emit(lo) # look behind + _compile(code, av[1], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op is AT: + emit(op) + if flags & SRE_FLAG_MULTILINE: + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & 
SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(av) + elif op is BRANCH: + emit(op) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = _len(code); emit(0) + # _compile_info(code, av, flags) + _compile(code, av, flags) + emit(JUMP) + tailappend(_len(code)); emit(0) + code[skip] = _len(code) - skip + emit(FAILURE) # end of branch + for tail in tail: + code[tail] = _len(code) - tail + elif op is CATEGORY: + emit(op) + if flags & SRE_FLAG_LOCALE: + av = CH_LOCALE[av] + elif flags & SRE_FLAG_UNICODE: + av = CH_UNICODE[av] + emit(av) + elif op is GROUPREF: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + elif flags & SRE_FLAG_LOCALE: + emit(GROUPREF_LOC_IGNORE) + elif not fixes: # ascii + emit(GROUPREF_IGNORE) + else: + emit(GROUPREF_UNI_IGNORE) + emit(av-1) + elif op is GROUPREF_EXISTS: + emit(op) + emit(av[0]-1) + skipyes = _len(code); emit(0) + _compile(code, av[1], flags) + if av[2]: + emit(JUMP) + skipno = _len(code); emit(0) + code[skipyes] = _len(code) - skipyes + 1 + _compile(code, av[2], flags) + code[skipno] = _len(code) - skipno + else: + code[skipyes] = _len(code) - skipyes + 1 + else: + raise PatternError(f"internal: unsupported operand type {op!r}") + +def _compile_charset(charset, flags, code): + # compile charset subprogram + emit = code.append + for op, av in charset: + emit(op) + if op is NEGATE: + pass + elif op is LITERAL: + emit(av) + elif op is RANGE or op is RANGE_UNI_IGNORE: + emit(av[0]) + emit(av[1]) + elif op is CHARSET: + code.extend(av) + elif op is BIGCHARSET: + code.extend(av) + elif op is CATEGORY: + if flags & SRE_FLAG_LOCALE: + emit(CH_LOCALE[av]) + elif flags & SRE_FLAG_UNICODE: + emit(CH_UNICODE[av]) + else: + emit(av) + else: + raise PatternError(f"internal: unsupported set operator {op!r}") + emit(FAILURE) + +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): + # internal: optimize character set + out = [] + tail = [] + charmap = bytearray(256) + hascased = False + for op, av in charset: + 
while True: + try: + if op is LITERAL: + if fixup: # IGNORECASE and not LOCALE + av = fixup(av) + charmap[av] = 1 + if fixes and av in fixes: + for k in fixes[av]: + charmap[k] = 1 + if not hascased and iscased(av): + hascased = True + else: + charmap[av] = 1 + elif op is RANGE: + r = range(av[0], av[1]+1) + if fixup: # IGNORECASE and not LOCALE + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) + else: + for i in r: + charmap[i] = 1 + elif op is NEGATE: + out.append((op, av)) + else: + tail.append((op, av)) + except IndexError: + if len(charmap) == 256: + # character set contains non-UCS1 character codes + charmap += b'\0' * 0xff00 + continue + # Character set contains non-BMP character codes. + # For range, all BMP characters in the range are already + # proceeded. + if fixup: # IGNORECASE and not LOCALE + # For now, IN_UNI_IGNORE+LITERAL and + # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP + # characters, because two characters (at least one of + # which is not in the BMP) match case-insensitively + # if and only if: + # 1) c1.lower() == c2.lower() + # 2) c1.lower() == c2 or c1.lower().upper() == c2 + # Also, both c.lower() and c.lower().upper() are single + # characters for every non-BMP character. 
+ if op is RANGE: + if fixes: # not ASCII + op = RANGE_UNI_IGNORE + hascased = True + else: + assert op is LITERAL + if not hascased and iscased(av): + hascased = True + tail.append((op, av)) + break + + # compress character map + runs = [] + q = 0 + while True: + p = charmap.find(1, q) + if p < 0: + break + if len(runs) >= 2: + runs = None + break + q = charmap.find(0, p) + if q < 0: + runs.append((p, len(charmap))) + break + runs.append((p, q)) + if runs is not None: + # use literal/range + for p, q in runs: + if q - p == 1: + out.append((LITERAL, p)) + else: + out.append((RANGE, (p, q - 1))) + out += tail + # if the case was changed or new representation is more compact + if hascased or len(out) < len(charset): + return out, hascased + # else original character set is good enough + return charset, hascased + + # use bitmap + if len(charmap) == 256: + data = _mk_bitmap(charmap) + out.append((CHARSET, data)) + out += tail + return out, hascased + + # To represent a big charset, first a bitmap of all characters in the + # set is constructed. Then, this bitmap is sliced into chunks of 256 + # characters, duplicate chunks are eliminated, and each chunk is + # given a number. In the compiled expression, the charset is + # represented by a 32-bit word sequence, consisting of one word for + # the number of different chunks, a sequence of 256 bytes (64 words) + # of chunk numbers indexed by their original chunk position, and a + # sequence of 256-bit chunks (8 words each). + + # Compression is normally good: in a typical charset, large ranges of + # Unicode will be either completely excluded (e.g. if only cyrillic + # letters are to be matched), or completely included (e.g. if large + # subranges of Kanji match). These ranges will be represented by + # chunks of all one-bits or all zero-bits. 
+ + # Matching can be also done efficiently: the more significant byte of + # the Unicode character is an index into the chunk number, and the + # less significant byte is a bit index in the chunk (just like the + # CHARSET matching). + + charmap = bytes(charmap) # should be hashable + comps = {} + mapping = bytearray(256) + block = 0 + data = bytearray() + for i in range(0, 65536, 256): + chunk = charmap[i: i + 256] + if chunk in comps: + mapping[i // 256] = comps[chunk] + else: + mapping[i // 256] = comps[chunk] = block + block += 1 + data += chunk + data = _mk_bitmap(data) + data[0:0] = [block] + _bytes_to_codes(mapping) + out.append((BIGCHARSET, data)) + out += tail + return out, hascased + +_CODEBITS = _sre.CODESIZE * 8 +MAXCODE = (1 << _CODEBITS) - 1 +_BITS_TRANS = b'0' + b'1' * 255 +def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): + s = bits.translate(_BITS_TRANS)[::-1] + return [_int(s[i - _CODEBITS: i], 2) + for i in range(len(s), 0, -_CODEBITS)] + +def _bytes_to_codes(b): + # Convert block indices to word array. CPython uses + # ``memoryview(b).cast('I')``; we decode CODESIZE-byte little/native + # words directly so we don't depend on memoryview.cast(). + import sys + cs = _sre.CODESIZE + assert len(b) % cs == 0 + return [int.from_bytes(bytes(b[i:i + cs]), sys.byteorder) + for i in range(0, len(b), cs)] + +def _simple(p): + # check if this subpattern is a "simple" operator + if len(p) != 1: + return False + op, av = p[0] + if op is SUBPATTERN: + return av[0] is None and _simple(av[-1]) + return op in _UNIT_CODES + +def _generate_overlap_table(prefix): + """ + Generate an overlap table for the following prefix. + An overlap table is a table of the same size as the prefix which + informs about the potential self-overlap for each index in the prefix: + - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 
+ - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with + prefix[0:k] + """ + table = [0] * len(prefix) + for i in range(1, len(prefix)): + idx = table[i - 1] + while prefix[i] != prefix[idx]: + if idx == 0: + table[i] = 0 + break + idx = table[idx - 1] + else: + table[i] = idx + 1 + return table + +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): + # look for literal prefix + prefix = [] + prefixappend = prefix.append + prefix_skip = None + iscased = _get_iscased(flags) + for op, av in pattern.data: + if op is LITERAL: + if iscased and iscased(av): + break + prefixappend(av) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + flags1 = _combine_flags(flags, add_flags, del_flags) + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: + break + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) + if prefix_skip is None: + if group is not None: + prefix_skip = len(prefix) + elif prefix_skip1 is not None: + prefix_skip = len(prefix) + prefix_skip1 + prefix.extend(prefix1) + if not got_all: + break + else: + break + else: + return prefix, prefix_skip, True + return prefix, prefix_skip, False + +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None + op, av = pattern.data[0] + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = _combine_flags(flags, add_flags, del_flags) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) + else: + 
def _compile_info(code, pattern, flags):
    """Append an INFO block for ``pattern`` to the code list ``code``.

    The block records the pattern's minimum and maximum width and, when one
    can be found, either a literal prefix (followed by its overlap table)
    or a prefix character set.  ``code`` is modified in place; nothing is
    returned.
    """
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    # Clamp the maximum width so it fits in a single code word.
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # Minimal block: skip=4 (the skip, mask, lo and hi words), mask=0.
        # No prefix search is attempted for a pattern that can match empty.
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    # Prefix information is not computed when IGNORECASE and LOCALE are
    # both set.
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
##    if prefix:
##        print("*** PREFIX", prefix, prefix_skip)
##    if charset:
##        print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    # Reserve the skip slot; it is back-patched on the last line once the
    # total size of the block is known.
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        # `got_all` is only bound when the prefix search above ran, which is
        # also the only way `prefix` can be non-empty here.
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        # Keep the prefix length representable in one code word as well.
        prefix = prefix[:MAXCODE]
    emit(hi)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        # A prefix_skip of None is replaced by the full prefix length.
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    # Back-patch: number of words from the skip slot to the end of the block.
    code[skip] = len(code) - skip
| flags + code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern + _compile(code, p.data, flags) + + code.append(SUCCESS) + + return code + +def _hex_code(code): + return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) + +def dis(code): + import sys + + labels = set() + level = 0 + offset_width = len(str(len(code) - 1)) + + def dis_(start, end): + def print_(*args, to=None): + if to is not None: + labels.add(to) + args += ('(to %d)' % (to,),) + print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), + end=' '*(level-1)) + print(*args) + + def print_2(*args): + print(end=' '*(offset_width + 2*level)) + print(*args) + + nonlocal level + level += 1 + i = start + while i < end: + start = i + op = code[i] + i += 1 + op = OPCODES[op] + if op in (SUCCESS, FAILURE, ANY, ANY_ALL, + MAX_UNTIL, MIN_UNTIL, NEGATE): + print_(op) + elif op in (LITERAL, NOT_LITERAL, + LITERAL_IGNORE, NOT_LITERAL_IGNORE, + LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, + LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, '%#02x (%r)' % (arg, chr(arg))) + elif op is AT: + arg = code[i] + i += 1 + arg = str(ATCODES[arg]) + assert arg[:3] == 'AT_' + print_(op, arg[3:]) + elif op is CATEGORY: + arg = code[i] + i += 1 + arg = str(CHCODES[arg]) + assert arg[:9] == 'CATEGORY_' + print_(op, arg[9:]) + elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op in (RANGE, RANGE_UNI_IGNORE): + lo, hi = code[i: i+2] + i += 2 + print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) + elif op is CHARSET: + print_(op, _hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + elif op is BIGCHARSET: + arg = code[i] + i += 1 + mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) + for x in code[i: i + 256//_sre.CODESIZE])) + print_(op, arg, mapping) + i += 256//_sre.CODESIZE + level += 1 + for j in 
range(arg): + print_2(_hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + level -= 1 + elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, + GROUPREF_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, arg) + elif op is JUMP: + skip = code[i] + print_(op, skip, to=i+skip) + i += 1 + elif op is BRANCH: + skip = code[i] + print_(op, skip, to=i+skip) + while skip: + dis_(i+1, i+skip) + i += skip + start = i + skip = code[i] + if skip: + print_('branch', skip, to=i+skip) + else: + print_(FAILURE) + i += 1 + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): + skip, min, max = code[i: i+3] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, to=i+skip) + dis_(i+3, i+skip) + i += skip + elif op is GROUPREF_EXISTS: + arg, skip = code[i: i+2] + print_(op, arg, skip, to=i+skip) + i += 2 + elif op in (ASSERT, ASSERT_NOT): + skip, arg = code[i: i+2] + print_(op, skip, arg, to=i+skip) + dis_(i+2, i+skip) + i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op is INFO: + skip, flags, min, max = code[i: i+4] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, bin(flags), min, max, to=i+skip) + start = i+4 + if flags & SRE_INFO_PREFIX: + prefix_len, prefix_skip = code[i+4: i+6] + print_2(' prefix_skip', prefix_skip) + start = i + 6 + prefix = code[start: start+prefix_len] + print_2(' prefix', + '[%s]' % ', '.join('%#02x' % x for x in prefix), + '(%r)' % ''.join(map(chr, prefix))) + start += prefix_len + print_2(' overlap', code[start: start+prefix_len]) + start += prefix_len + if flags & SRE_INFO_CHARSET: + level += 1 + print_2('in') + dis_(start, i+skip) + level -= 1 + i += skip + else: + raise ValueError(op) + + level -= 1 + + dis_(0, len(code)) + + +def compile(p, flags=0): + # internal: convert pattern list to internal format + + if isstring(p): + pattern = p + p = _parser.parse(p, flags) + else: + pattern = 
class PatternError(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The index in the pattern where compilation failed (may be None)
        lineno: The line corresponding to pos (may be None)
        colno: The column corresponding to pos (may be None)
    """

    # Report the class as living in the public ``re`` module.
    __module__ = 're'

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        text = msg
        if pattern is None or pos is None:
            # Without both a pattern and a position there is no location.
            self.lineno = None
            self.colno = None
        else:
            text = '%s at position %d' % (msg, pos)
            # Use a newline of the same type as the pattern (str or bytes).
            nl = '\n' if isinstance(pattern, str) else b'\n'
            self.lineno = 1 + pattern.count(nl, 0, pos)
            self.colno = pos - pattern.rfind(nl, 0, pos)
            # Line/column are only appended to the message for patterns
            # that actually contain a newline.
            if nl in pattern:
                text = '%s (line %d, column %d)' % (text, self.lineno, self.colno)
        super().__init__(text)
'IN_LOC_IGNORE', + 'LITERAL_LOC_IGNORE', + 'NOT_LITERAL_LOC_IGNORE', + + 'GROUPREF_UNI_IGNORE', + 'IN_UNI_IGNORE', + 'LITERAL_UNI_IGNORE', + 'NOT_LITERAL_UNI_IGNORE', + 'RANGE_UNI_IGNORE', + + # The following opcodes are only occurred in the parser output, + # but not in the compiled code. + 'MIN_REPEAT', 'MAX_REPEAT', +) +del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT + +# positions +ATCODES = _makecodes( + 'AT_BEGINNING', 'AT_BEGINNING_LINE', 'AT_BEGINNING_STRING', + 'AT_BOUNDARY', 'AT_NON_BOUNDARY', + 'AT_END', 'AT_END_LINE', 'AT_END_STRING', + + 'AT_LOC_BOUNDARY', 'AT_LOC_NON_BOUNDARY', + + 'AT_UNI_BOUNDARY', 'AT_UNI_NON_BOUNDARY', +) + +# categories +CHCODES = _makecodes( + 'CATEGORY_DIGIT', 'CATEGORY_NOT_DIGIT', + 'CATEGORY_SPACE', 'CATEGORY_NOT_SPACE', + 'CATEGORY_WORD', 'CATEGORY_NOT_WORD', + 'CATEGORY_LINEBREAK', 'CATEGORY_NOT_LINEBREAK', + + 'CATEGORY_LOC_WORD', 'CATEGORY_LOC_NOT_WORD', + + 'CATEGORY_UNI_DIGIT', 'CATEGORY_UNI_NOT_DIGIT', + 'CATEGORY_UNI_SPACE', 'CATEGORY_UNI_NOT_SPACE', + 'CATEGORY_UNI_WORD', 'CATEGORY_UNI_NOT_WORD', + 'CATEGORY_UNI_LINEBREAK', 'CATEGORY_UNI_NOT_LINEBREAK', +) + + +# replacement operations for "ignore case" mode +OP_IGNORE = { + LITERAL: LITERAL_IGNORE, + NOT_LITERAL: NOT_LITERAL_IGNORE, +} + +OP_LOCALE_IGNORE = { + LITERAL: LITERAL_LOC_IGNORE, + NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, +} + +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + +AT_MULTILINE = { + AT_BEGINNING: AT_BEGINNING_LINE, + AT_END: AT_END_LINE +} + +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + +CH_LOCALE = { + CATEGORY_DIGIT: CATEGORY_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, + CATEGORY_WORD: CATEGORY_LOC_WORD, + CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, + CATEGORY_LINEBREAK: 
CATEGORY_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK +} + +CH_UNICODE = { + CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_UNI_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, + CATEGORY_WORD: CATEGORY_UNI_WORD, + CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK +} + +# flags +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode "locale" +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set diff --git a/crates/weavepy-vm/src/stdlib/python/re_engine.py b/crates/weavepy-vm/src/stdlib/python/re_engine.py new file mode 100644 index 0000000..de9c8eb --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/re_engine.py @@ -0,0 +1,426 @@ +# +# WeavePy: user-visible Pattern / Match objects for the re package. +# +# CPython implements `re.Pattern` and `re.Match` as C types inside the +# `_sre` extension. WeavePy instead keeps `_sre` as a pure-data +# backtracking core (compile + exec returning group spans) and builds +# the user-facing objects here, in Python. Doing so means callable +# `re.sub` replacements, `finditer`, `Scanner`, etc. all run on the +# normal interpreter without the engine ever re-entering the VM. +# +# Behaviour (group semantics, greedy/lazy scanning, empty-match +# handling, split/sub/subn rules) follows CPython 3.13 exactly. + +import _sre +from . 
class Pattern:
    """Compiled regular expression (WeavePy's Python-level ``re.Pattern``).

    The object wraps an opaque engine handle and drives the native
    ``_sre.exec`` core; all user-visible matching, splitting and
    substitution logic lives here.

    Attributes:
        pattern: the source pattern the object was compiled from.
        flags: the effective flag bits.
        groups: number of capturing groups.
        groupindex: mapping of group name -> group number.
    """

    __module__ = 're'

    def __init__(self, handle, pattern, flags, groups, groupindex, indexgroup):
        self._handle = handle
        self.pattern = pattern
        self.flags = flags
        self.groups = groups
        self.groupindex = groupindex
        # tuple: group number (1-based) -> name or None
        self._indexgroup = indexgroup

    # -- internal --------------------------------------------------------

    def _exec(self, string, pos, endpos, mode, must_advance):
        """Run the native engine once.

        Returns ``None`` when nothing matched, otherwise the raw result
        sequence whose first two items are the overall match start and end.
        """
        return _sre.exec(self._handle, string, pos, endpos, mode,
                         1 if must_advance else 0)

    def _iter(self, string, pos, endpos):
        """Yield a Match for every non-overlapping match in the clamped span."""
        pos, endpos = _clamp_span(string, pos, endpos)
        must_advance = False
        # Every Match reports the span the caller asked for, not the
        # position the scan has advanced to.
        opos, oendpos = pos, endpos
        while pos <= endpos:
            r = self._exec(string, pos, endpos, _MODE_SEARCH, must_advance)
            if r is None:
                break
            start, end = r[0], r[1]
            yield Match(self, string, opos, oendpos, r)
            # After an empty match the next search is told it must advance,
            # so the scan cannot loop forever at the same position.
            must_advance = start == end
            pos = end

    # -- public matching API --------------------------------------------

    def match(self, string, pos=0, endpos=None):
        """Match at the start of the span; return a Match or None."""
        p, e = _clamp_span(string, pos, endpos)
        r = self._exec(string, p, e, _MODE_MATCH, False)
        if r is None:
            return None
        return Match(self, string, p, e, r)

    def fullmatch(self, string, pos=0, endpos=None):
        """Match the whole span; return a Match or None."""
        p, e = _clamp_span(string, pos, endpos)
        r = self._exec(string, p, e, _MODE_FULLMATCH, False)
        if r is None:
            return None
        return Match(self, string, p, e, r)

    def search(self, string, pos=0, endpos=None):
        """Find the first match anywhere in the span; return a Match or None."""
        p, e = _clamp_span(string, pos, endpos)
        r = self._exec(string, p, e, _MODE_SEARCH, False)
        if r is None:
            return None
        return Match(self, string, p, e, r)

    def findall(self, string, pos=0, endpos=None):
        """Return all matches as a list.

        Items are whole-match strings when the pattern has no groups, the
        single group's text when it has one, and tuples of group texts
        otherwise.  Groups that did not participate are reported as an
        empty string of the subject's type.
        """
        g = self.groups
        empty = string[:0]
        out = []
        for m in self._iter(string, pos, endpos):
            if g == 0:
                out.append(m.group(0))
            elif g == 1:
                v = m.group(1)
                out.append(v if v is not None else empty)
            else:
                row = []
                for i in range(1, g + 1):
                    v = m.group(i)
                    row.append(v if v is not None else empty)
                out.append(tuple(row))
        return out

    def finditer(self, string, pos=0, endpos=None):
        """Return an iterator of Match objects over all matches."""
        return self._iter(string, pos, endpos)

    def sub(self, repl, string, count=0):
        """Return ``string`` with matches replaced by ``repl``."""
        return self._subx(repl, string, count)[0]

    def subn(self, repl, string, count=0):
        """Like sub(), but return ``(new_string, number_of_substitutions)``."""
        return self._subx(repl, string, count)

    def _subx(self, repl, string, count):
        """Shared implementation of sub() and subn().

        ``count == 0`` means "no limit".  A negative ``count`` performs no
        substitution at all, which is what CPython's loop condition
        ``!count || n < count`` yields and is consistent with split()
        treating a negative ``maxsplit`` as "do not split".
        """
        empty = string[:0]
        if callable(repl):
            filt = repl
        else:
            # The template is compiled before scanning, so an invalid
            # template is reported even if no replacement ends up happening.
            template = _compile_template(self, repl)
            if len(template) == 1 and not isinstance(template[0], int):
                # pure literal replacement
                literal = template[0]
                filt = lambda m, _l=literal: _l
            else:
                filt = lambda m, _t=template: _expand_template(_t, m)
        out = []
        n = 0
        last = 0
        pos = 0
        endpos = len(string)
        must_advance = False
        while pos <= endpos and (count == 0 or n < count):
            r = self._exec(string, pos, endpos, _MODE_SEARCH, must_advance)
            if r is None:
                break
            start, end = r[0], r[1]
            out.append(string[last:start])
            m = Match(self, string, 0, endpos, r)
            out.append(filt(m))
            last = end
            n += 1
            must_advance = start == end
            pos = end
        out.append(string[last:])
        return empty.join(out), n

    def split(self, string, maxsplit=0):
        """Split ``string`` at matches; captured groups are included.

        ``maxsplit == 0`` means "no limit"; a negative ``maxsplit`` returns
        ``[string]`` unsplit.
        """
        if maxsplit < 0:
            return [string]
        g = self.groups
        out = []
        n = 0
        last = 0
        pos = 0
        endpos = len(string)
        must_advance = False
        while pos <= endpos:
            if maxsplit and n >= maxsplit:
                break
            r = self._exec(string, pos, endpos, _MODE_SEARCH, must_advance)
            if r is None:
                break
            start, end = r[0], r[1]
            m = Match(self, string, 0, endpos, r)
            out.append(string[last:start])
            for i in range(1, g + 1):
                out.append(m.group(i))
            last = end
            n += 1
            must_advance = start == end
            pos = end
        out.append(string[last:])
        return out

    def scanner(self, string, pos=0, endpos=None):
        """Return a stateful scanner positioned at the start of the span."""
        return _Scanner(self, string, pos, endpos)

    # -- misc ------------------------------------------------------------

    def __repr__(self):
        s = repr(self.pattern)
        if len(s) > 200:
            s = s[:200]
        # Hide the implicit UNICODE flag (32) the way CPython does.
        flags = self.flags & ~32
        if flags:
            return "re.compile(%s, %s)" % (s, _flags_repr(flags))
        return "re.compile(%s)" % s

    def __copy__(self):
        # Copying returns the same object.
        return self

    def __deepcopy__(self, memo):
        return self

    @property
    def groupindex_proxy(self):
        return self.groupindex
class Match:
    """Result of a successful match (WeavePy's Python-level ``re.Match``).

    ``r`` is the raw engine result: ``r[0]`` and ``r[1]`` are the overall
    start and end offsets, ``r[2]`` is the index of the last matched group
    (negative when there is none) and ``r[3]`` is a flat sequence of
    ``start, end`` pairs for groups 1..N, with negative offsets marking
    groups that did not participate.

    Attributes:
        re: the Pattern that produced the match.
        string: the subject that was searched.
        pos, endpos: the span of ``string`` that was searched.
    """

    __module__ = 're'

    def __init__(self, pattern, string, pos, endpos, r):
        self.re = pattern
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self._start = r[0]
        self._end = r[1]
        self._lastindex_raw = r[2]
        self._marks = r[3]

    # -- group span helpers ---------------------------------------------

    def _span_of(self, idx):
        """Return the ``(start, end)`` offsets of group number ``idx``."""
        if idx == 0:
            return (self._start, self._end)
        i = (idx - 1) * 2
        return (self._marks[i], self._marks[i + 1])

    def _index(self, group):
        """Resolve a group number or name to a validated group number.

        Raises IndexError("no such group") for unknown names and for
        numbers outside ``0..groups``.
        """
        if isinstance(group, int) or (not isinstance(group, str) and hasattr(group, '__index__')):
            idx = int(group)
        else:
            try:
                idx = self.re.groupindex[group]
            except KeyError:
                raise IndexError("no such group") from None
        if not 0 <= idx <= self.re.groups:
            raise IndexError("no such group")
        return idx

    def _getslice(self, idx, default):
        """Return the text of group ``idx``, or ``default`` if it is unset."""
        s, e = self._span_of(idx)
        if s < 0 or e < 0:
            return default
        return self.string[s:e]

    # -- public API ------------------------------------------------------

    def group(self, *args):
        """Return one group's text, or a tuple when several are requested.

        With no arguments the whole match is returned.  Groups that did
        not participate yield None.
        """
        if not args:
            return self._getslice(0, None)
        if len(args) == 1:
            return self._getslice(self._index(args[0]), None)
        return tuple(self._getslice(self._index(g), None) for g in args)

    def __getitem__(self, group):
        return self._getslice(self._index(group), None)

    def groups(self, default=None):
        """Return a tuple with the text of groups 1..N."""
        return tuple(self._getslice(i, default)
                     for i in range(1, self.re.groups + 1))

    def groupdict(self, default=None):
        """Return a dict mapping every group name to its text."""
        result = {}
        for name, idx in self.re.groupindex.items():
            result[name] = self._getslice(idx, default)
        return result

    def start(self, group=0):
        """Return the start offset of ``group`` (negative if unset)."""
        return self._span_of(self._index(group))[0]

    def end(self, group=0):
        """Return the end offset of ``group`` (negative if unset)."""
        return self._span_of(self._index(group))[1]

    def span(self, group=0):
        """Return ``(start, end)`` for ``group``."""
        return self._span_of(self._index(group))

    @property
    def regs(self):
        """Tuple of ``(start, end)`` pairs for group 0 and every group."""
        spans = [(self._start, self._end)]
        for i in range(1, self.re.groups + 1):
            spans.append(self._span_of(i))
        return tuple(spans)

    @property
    def lastindex(self):
        """Number of the last matched group, or None."""
        li = self._lastindex_raw
        return None if li < 0 else li

    @property
    def lastgroup(self):
        """Name of the last matched group, or None."""
        li = self.lastindex
        if li is None:
            return None
        try:
            return self.re._indexgroup[li]
        except (IndexError, TypeError):
            return None

    def expand(self, template):
        """Return ``template`` with group references substituted."""
        return _expand_template(_parse_template(self.re, template), self)

    def __copy__(self):
        # Copying returns the same object.
        return self

    def __deepcopy__(self, memo):
        return self

    def __repr__(self):
        text = self.string[self._start:self._end]
        # The previous body applied three arguments to an empty format
        # string, which raises TypeError.  The format below follows
        # CPython's match repr, including its 50-character cap on the
        # matched text.
        return "<re.Match object; span=(%d, %d), match=%.50r>" % (
            self._start, self._end, text)
KeyError: + pass + template = _parser.parse_template(repl, pattern) + if len(_template_cache) >= 512: + _template_cache.clear() + _template_cache[key] = template + return template + + +def _expand_template(template, match): + # `template` is the flat list returned by _parser.parse_template: + # literals (str/bytes) interleaved with integer group references. + empty = match.string[:0] + parts = [] + for item in template: + if isinstance(item, int): + g = match.group(item) + parts.append(g if g is not None else empty) + else: + parts.append(item) + return empty.join(parts) + + +def clear_template_cache(): + _template_cache.clear() diff --git a/crates/weavepy-vm/src/stdlib/python/re_init.py b/crates/weavepy-vm/src/stdlib/python/re_init.py new file mode 100644 index 0000000..862400c --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/re_init.py @@ -0,0 +1,350 @@ +# +# Secret Labs' Regular Expression Engine +# +# re-compatible interface for the sre matching engine +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license. For any other use, please contact Secret Labs +# AB (info@pythonware.com). +# + +r"""Support for regular expressions (RE). + +This module provides regular expression matching operations similar to +those found in Perl. It supports both 8-bit and Unicode strings; both +the pattern and the strings being processed can contain null bytes and +characters outside the US ASCII range. + +Regular expressions can contain both special and ordinary characters. +Most ordinary characters, like "A", "a", or "0", are the simplest +regular expressions; they simply match themselves. You can +concatenate ordinary characters, so last matches the string 'last'. + +This module exports the following functions: + match Match a regular expression pattern to the beginning of a string. + fullmatch Match a regular expression pattern to all of a string. 
+ search Search a string for the presence of a pattern. + sub Substitute occurrences of a pattern found in a string. + subn Same as sub, but also return the number of substitutions made. + split Split a string by the occurrences of a pattern. + findall Find all occurrences of a pattern in a string. + finditer Return an iterator yielding a Match object for each match. + compile Compile a pattern into a Pattern object. + purge Clear the regular expression cache. + escape Backslash all non-alphanumerics in a string. + +This module also defines an exception 'PatternError', aliased to 'error' +for backward compatibility. + +""" + +import enum +from . import _compiler, _parser +from . import _engine +import functools +import _sre + + +# public symbols +__all__ = [ + "match", "fullmatch", "search", "sub", "subn", "split", + "findall", "finditer", "compile", "purge", "escape", + "error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U", + "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", + "UNICODE", "NOFLAG", "RegexFlag", "PatternError" +] + +__version__ = "2.2.1" + + +class RegexFlag(enum.IntFlag): + NOFLAG = 0 + ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale" + IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case + LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale + UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale" + MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline + DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline + VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments + # sre extensions (experimental, don't rely on these) + DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation + +globals().update(RegexFlag.__members__) + +# sre exception +PatternError = error = _compiler.PatternError + +# -------------------------------------------------------------------- +# public interface + +def match(pattern, string, flags=0): + 
"""Try to apply the pattern at the start of the string, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).match(string) + +def fullmatch(pattern, string, flags=0): + """Try to apply the pattern to all of the string, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).fullmatch(string) + +def search(pattern, string, flags=0): + """Scan through string looking for a match to the pattern, returning + a Match object, or None if no match was found.""" + return _compile(pattern, flags).search(string) + +class _ZeroSentinel(int): + pass +_zero_sentinel = _ZeroSentinel() + +def sub(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel): + """Return the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in string by the + replacement repl. repl can be either a string or a callable; + if a string, backslash escapes in it are processed. If it is + a callable, it's passed the Match object and must return + a replacement string to be used.""" + if args: + if count is not _zero_sentinel: + raise TypeError("sub() got multiple values for argument 'count'") + count, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("sub() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("sub() takes from 3 to 5 positional arguments " + "but %d were given" % (5 + len(args))) + + import warnings + warnings.warn( + "'count' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + elif count is _zero_sentinel: + count = 0 + if flags is _zero_sentinel: + flags = 0 + + return _compile(pattern, flags).sub(repl, string, count) +sub.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' + +def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel): + """Return a 2-tuple containing (new_string, number). 
+ new_string is the string obtained by replacing the leftmost + non-overlapping occurrences of the pattern in the source + string by the replacement repl. number is the number of + substitutions that were made. repl can be either a string or a + callable; if a string, backslash escapes in it are processed. + If it is a callable, it's passed the Match object and must + return a replacement string to be used.""" + if args: + if count is not _zero_sentinel: + raise TypeError("subn() got multiple values for argument 'count'") + count, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("subn() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("subn() takes from 3 to 5 positional arguments " + "but %d were given" % (5 + len(args))) + + import warnings + warnings.warn( + "'count' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + elif count is _zero_sentinel: + count = 0 + if flags is _zero_sentinel: + flags = 0 + + return _compile(pattern, flags).subn(repl, string, count) +subn.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' + +def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel): + """Split the source string by the occurrences of the pattern, + returning a list containing the resulting substrings. If + capturing parentheses are used in pattern, then the text of all + groups in the pattern are also returned as part of the resulting + list. 
If maxsplit is nonzero, at most maxsplit splits occur, + and the remainder of the string is returned as the final element + of the list.""" + if args: + if maxsplit is not _zero_sentinel: + raise TypeError("split() got multiple values for argument 'maxsplit'") + maxsplit, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("split() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("split() takes from 2 to 4 positional arguments " + "but %d were given" % (4 + len(args))) + + import warnings + warnings.warn( + "'maxsplit' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + elif maxsplit is _zero_sentinel: + maxsplit = 0 + if flags is _zero_sentinel: + flags = 0 + + return _compile(pattern, flags).split(string, maxsplit) +split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)' + +def findall(pattern, string, flags=0): + """Return a list of all non-overlapping matches in the string. + + If one or more capturing groups are present in the pattern, return + a list of groups; this will be a list of tuples if the pattern + has more than one group. + + Empty matches are included in the result.""" + return _compile(pattern, flags).findall(string) + +def finditer(pattern, string, flags=0): + """Return an iterator over all non-overlapping matches in the + string. For each match, the iterator returns a Match object. + + Empty matches are included in the result.""" + return _compile(pattern, flags).finditer(string) + +def compile(pattern, flags=0): + "Compile a regular expression pattern, returning a Pattern object." 
+ return _compile(pattern, flags) + +def purge(): + "Clear the regular expression caches" + _cache.clear() + _cache2.clear() + _engine.clear_template_cache() + + +# SPECIAL_CHARS +# closing ')', '}' and ']' +# '-' (a range in character set) +# '&', '~', (extended character set operations) +# '#' (comment) and WHITESPACE (ignored) in verbose mode +_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'} + +def escape(pattern): + """ + Escape special characters in a string. + """ + if isinstance(pattern, str): + return pattern.translate(_special_chars_map) + else: + pattern = str(pattern, 'latin1') + return pattern.translate(_special_chars_map).encode('latin1') + +Pattern = type(_compiler.compile('', 0)) +Match = type(_compiler.compile('', 0).match('')) + +# -------------------------------------------------------------------- +# internals + +# Use the fact that dict keeps the insertion order. +# _cache2 uses the simple FIFO policy which has better latency. +# _cache uses the LRU policy which has better hit rate. +_cache = {} # LRU +_cache2 = {} # FIFO +_MAXCACHE = 512 +_MAXCACHE2 = 256 +assert _MAXCACHE2 < _MAXCACHE + +def _compile(pattern, flags): + # internal: compile pattern + if isinstance(flags, RegexFlag): + flags = flags.value + try: + return _cache2[type(pattern), pattern, flags] + except KeyError: + pass + + key = (type(pattern), pattern, flags) + # Item in _cache should be moved to the end if found. + p = _cache.pop(key, None) + if p is None: + if isinstance(pattern, Pattern): + if flags: + raise ValueError( + "cannot process flags argument with a compiled pattern") + return pattern + if not _compiler.isstring(pattern): + raise TypeError("first argument must be string or compiled pattern") + p = _compiler.compile(pattern, flags) + if flags & DEBUG: + return p + if len(_cache) >= _MAXCACHE: + # Drop the least recently used item. 
+ try: + del _cache[next(iter(_cache))] + except (StopIteration, RuntimeError, KeyError): + pass + # Append to the end. + _cache[key] = p + + if len(_cache2) >= _MAXCACHE2: + # Drop the oldest item. + try: + del _cache2[next(iter(_cache2))] + except (StopIteration, RuntimeError, KeyError): + pass + _cache2[key] = p + return p + +# register myself for pickling + +import copyreg + +def _pickle(p): + return _compile, (p.pattern, p.flags) + +copyreg.pickle(Pattern, _pickle, _compile) + +# -------------------------------------------------------------------- +# experimental stuff (see python-dev discussions for details) + +class Scanner: + def __init__(self, lexicon, flags=0): + from ._constants import BRANCH, SUBPATTERN + if isinstance(flags, RegexFlag): + flags = flags.value + self.lexicon = lexicon + # combine phrases into a compound pattern + p = [] + s = _parser.State() + s.flags = flags + for phrase, action in lexicon: + gid = s.opengroup() + p.append(_parser.SubPattern(s, [ + (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), + ])) + s.closegroup(gid, p[-1]) + p = _parser.SubPattern(s, [(BRANCH, (None, p))]) + self.scanner = _compiler.compile(p) + def scan(self, string): + result = [] + append = result.append + match = self.scanner.scanner(string).match + i = 0 + while True: + m = match() + if not m: + break + j = m.end() + if i == j: + break + action = self.lexicon[m.lastindex-1][1] + if callable(action): + self.match = m + action = action(self, m.group()) + if action is not None: + append(action) + i = j + return result, string[i:] diff --git a/crates/weavepy-vm/src/stdlib/python/re_parser.py b/crates/weavepy-vm/src/stdlib/python/re_parser.py new file mode 100644 index 0000000..f3c7793 --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/re_parser.py @@ -0,0 +1,1081 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert re-style regular expression to sre pattern +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 
+# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +# XXX: show string offset and offending character for all errors + +from ._constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") +ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) +_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) + +ESCAPES = { + r"\a": (LITERAL, ord("\a")), + r"\b": (LITERAL, ord("\b")), + r"\f": (LITERAL, ord("\f")), + r"\n": (LITERAL, ord("\n")), + r"\r": (LITERAL, ord("\r")), + r"\t": (LITERAL, ord("\t")), + r"\v": (LITERAL, ord("\v")), + r"\\": (LITERAL, ord("\\")) +} + +CATEGORIES = { + r"\A": (AT, AT_BEGINNING_STRING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END_STRING), # end of string +} + +FLAGS = { + # standard flags + "i": SRE_FLAG_IGNORECASE, + "L": SRE_FLAG_LOCALE, + "m": SRE_FLAG_MULTILINE, + "s": SRE_FLAG_DOTALL, + "x": SRE_FLAG_VERBOSE, + # extensions + "a": SRE_FLAG_ASCII, + "u": SRE_FLAG_UNICODE, +} + +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG + +# Maximal value returned by SubPattern.getwidth(). +# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. 
+MAXWIDTH = 1 << 64 + +class State: + # keeps track of state for parsing + def __init__(self): + self.flags = 0 + self.groupdict = {} + self.groupwidths = [None] # group 0 + self.lookbehindgroups = None + self.grouprefpos = {} + @property + def groups(self): + return len(self.groupwidths) + def opengroup(self, name=None): + gid = self.groups + self.groupwidths.append(None) + if self.groups > MAXGROUPS: + raise error("too many groups") + if name is not None: + ogid = self.groupdict.get(name, None) + if ogid is not None: + raise error("redefinition of group name %r as group %d; " + "was group %d" % (name, gid, ogid)) + self.groupdict[name] = gid + return gid + def closegroup(self, gid, p): + self.groupwidths[gid] = p.getwidth() + def checkgroup(self, gid): + return gid < self.groups and self.groupwidths[gid] is not None + + def checklookbehindgroup(self, gid, source): + if self.lookbehindgroups is not None: + if not self.checkgroup(gid): + raise source.error('cannot refer to an open group') + if gid >= self.lookbehindgroups: + raise source.error('cannot refer to group defined in the same ' + 'lookbehind subpattern') + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, state, data=None): + self.state = state + if data is None: + data = [] + self.data = data + self.width = None + + def dump(self, level=0): + seqtypes = (tuple, list) + for op, av in self.data: + print(level*" " + str(op), end='') + if op is IN: + # member sublanguage + print() + for op, a in av: + print((level+1)*" " + str(op), a) + elif op is BRANCH: + print() + for i, a in enumerate(av[1]): + if i: + print(level*" " + "OR") + a.dump(level+1) + elif op is GROUPREF_EXISTS: + condgroup, item_yes, item_no = av + print('', condgroup) + item_yes.dump(level+1) + if item_no: + print(level*" " + "ELSE") + item_no.dump(level+1) + elif isinstance(av, SubPattern): + print() + av.dump(level+1) + elif isinstance(av, seqtypes): + nl = False + for a in av: + if isinstance(a, SubPattern): + 
if not nl: + print() + a.dump(level+1) + nl = True + else: + if not nl: + print(' ', end='') + print(a, end='') + nl = False + if not nl: + print() + else: + print('', av) + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + if isinstance(index, slice): + return SubPattern(self.state, self.data[index]) + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width is not None: + return self.width + lo = hi = 0 + for op, av in self.data: + if op is BRANCH: + i = MAXWIDTH + j = 0 + for av in av[1]: + l, h = av.getwidth() + i = min(i, l) + j = max(j, h) + lo = lo + i + hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[-1].getwidth() + lo = lo + i + hi = hi + j + elif op in _REPEATCODES: + i, j = av[2].getwidth() + lo = lo + i * av[0] + if av[1] == MAXREPEAT and j: + hi = MAXWIDTH + else: + hi = hi + j * av[1] + elif op in _UNITCODES: + lo = lo + 1 + hi = hi + 1 + elif op is GROUPREF: + i, j = self.state.groupwidths[av] + lo = lo + i + hi = hi + j + elif op is GROUPREF_EXISTS: + i, j = av[1].getwidth() + if av[2] is not None: + l, h = av[2].getwidth() + i = min(i, l) + j = max(j, h) + else: + i = 0 + lo = lo + i + hi = hi + j + elif op is SUCCESS: + break + self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) + return self.width + +class Tokenizer: + def __init__(self, string): + self.istext = isinstance(string, str) + self.string = string + if not self.istext: + string = str(string, 'latin1') + self.decoded_string = string + self.index = 0 + self.next = None + self.__next() + def __next(self): + index = self.index + try: + char = 
self.decoded_string[index] + except IndexError: + self.next = None + return + if char == "\\": + index += 1 + try: + char += self.decoded_string[index] + except IndexError: + raise error("bad escape (end of pattern)", + self.string, len(self.string) - 1) from None + self.index = index + 1 + self.next = char + def match(self, char): + if char == self.next: + self.__next() + return True + return False + def get(self): + this = self.next + self.__next() + return this + def getwhile(self, n, charset): + result = '' + for _ in range(n): + c = self.next + if c not in charset: + break + result += c + self.__next() + return result + def getuntil(self, terminator, name): + result = '' + while True: + c = self.next + self.__next() + if c is None: + if not result: + raise self.error("missing " + name) + raise self.error("missing %s, unterminated name" % terminator, + len(result)) + if c == terminator: + if not result: + raise self.error("missing " + name, 1) + break + result += c + return result + @property + def pos(self): + return self.index - len(self.next or '') + def tell(self): + return self.index - len(self.next or '') + def seek(self, index): + self.index = index + self.__next() + + def error(self, msg, offset=0): + if not self.istext: + msg = msg.encode('ascii', 'backslashreplace').decode('ascii') + return error(msg, self.string, self.tell() - offset) + + def checkgroupname(self, name, offset): + if not (self.istext or name.isascii()): + msg = "bad character in group name %a" % name + raise self.error(msg, len(name) + offset) + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + +def _class_escape(source, escape): + # handle escape code inside character class + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + if code and code[0] is IN: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape (exactly two digits) + escape += source.getwhile(2, 
HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c in OCTDIGITS: + # octal escape (up to three digits) + escape += source.getwhile(2, OCTDIGITS) + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, len(escape)) + return LITERAL, c + elif c in DIGITS: + raise ValueError + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error('bad escape %s' % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _escape(source, escape, state): + # handle escape code in expression + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, 
len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c == "0": + # octal escape + escape += source.getwhile(2, OCTDIGITS) + return LITERAL, int(escape[1:], 8) + elif c in DIGITS: + # octal escape *or* decimal group reference (sigh) + if source.next in DIGITS: + escape += source.get() + if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and + source.next in OCTDIGITS): + # got three octal digits; this is an octal escape + escape += source.get() + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, + len(escape)) + return LITERAL, c + # not an octal escape, so this is a group reference + group = int(escape[1:]) + if group < state.groups: + if not state.checkgroup(group): + raise source.error("cannot refer to an open group", + len(escape)) + state.checklookbehindgroup(group, source) + return GROUPREF, group + raise source.error("invalid group reference %d" % group, len(escape) - 1) + if len(escape) == 2: + if c in ASCIILETTERS: + raise 
source.error("bad escape %s" % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _uniq(items): + return list(dict.fromkeys(items)) + +def _parse_sub(source, state, verbose, nested): + # parse an alternation: a|b|c + + items = [] + itemsappend = items.append + sourcematch = source.match + start = source.tell() + while True: + itemsappend(_parse(source, state, verbose, nested + 1, + not nested and not items)) + if not sourcematch("|"): + break + if not nested: + verbose = state.flags & SRE_FLAG_VERBOSE + + if len(items) == 1: + return items[0] + + subpattern = SubPattern(state) + + # check if all items share a common prefix + while True: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + set = [] + for item in items: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: + break + else: + # we can store this as a character set instead of a + # branch (the compiler may optimize this even more) + subpattern.append((IN, _uniq(set))) + return subpattern + + subpattern.append((BRANCH, (None, items))) + return subpattern + +def _parse(source, state, verbose, nested, first=False): + # parse a simple pattern + subpattern = SubPattern(state) + + # precompute constants into local variables + subpatternappend = subpattern.append + sourceget = source.get + sourcematch = source.match + _len = len + _ord = ord + + while True: + + this = source.next + if this is None: + break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() + + 
if verbose: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while True: + this = sourceget() + if this is None or this == "\n": + break + continue + + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) + + elif this == "[": + here = source.tell() - 1 + # character set + set = [] + setappend = set.append +## if sourcematch(":"): +## pass # handle character classes + if source.next == '[': + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 6 + ) + negate = sourcematch("^") + # check remaining characters + while True: + this = sourceget() + if this is None: + raise source.error("unterminated character set", + source.tell() - here) + if this == "]" and set: + break + elif this[0] == "\\": + code1 = _class_escape(source, this) + else: + if set and this in '-&~|' and source.next == this: + import warnings + warnings.warn( + 'Possible set %s at position %d' % ( + 'difference' if this == '-' else + 'intersection' if this == '&' else + 'symmetric difference' if this == '~' else + 'union', + source.tell() - 1), + FutureWarning, stacklevel=nested + 6 + ) + code1 = LITERAL, _ord(this) + if sourcematch("-"): + # potential range + that = sourceget() + if that is None: + raise source.error("unterminated character set", + source.tell() - here) + if that == "]": + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + setappend((LITERAL, _ord("-"))) + break + if that[0] == "\\": + code2 = _class_escape(source, that) + else: + if that == '-': + import warnings + warnings.warn( + 'Possible set difference at position %d' % ( + source.tell() - 2), + FutureWarning, stacklevel=nested + 6 + ) + code2 = LITERAL, _ord(that) + if code1[0] != LITERAL or code2[0] != LITERAL: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 
1 + len(that)) + lo = code1[1] + hi = code2[1] + if hi < lo: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + setappend((RANGE, (lo, hi))) + else: + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + + set = _uniq(set) + # XXX: should move set optimization to compiler! + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) + else: + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known + subpatternappend((IN, set)) + + elif this in REPEAT_CHARS: + # repeat previous item + here = source.tell() + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, MAXREPEAT + + elif this == "+": + min, max = 1, MAXREPEAT + elif this == "{": + if source.next == "}": + subpatternappend((LITERAL, _ord(this))) + continue + + min, max = 0, MAXREPEAT + lo = hi = "" + while source.next in DIGITS: + lo += sourceget() + if sourcematch(","): + while source.next in DIGITS: + hi += sourceget() + else: + hi = lo + if not sourcematch("}"): + subpatternappend((LITERAL, _ord(this))) + source.seek(here) + continue + + if lo: + min = int(lo) + if min >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if hi: + max = int(hi) + if max >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if max < min: + raise source.error("min repeat greater than max repeat", + source.tell() - here) + else: + raise AssertionError("unsupported quantifier %r" % (char,)) + # figure out which item to repeat + if subpattern: + item = subpattern[-1:] + else: + item = None + if not item or item[0][0] is AT: + raise source.error("nothing to repeat", + source.tell() - here + len(this)) + if item[0][0] in _REPEATCODES: + raise source.error("multiple repeat", + source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: 
+ group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p + if sourcematch("?"): + # Non-Greedy Match + subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) + else: + # Greedy Match + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + + elif this == ".": + subpatternappend((ANY, None)) + + elif this == "(": + start = source.tell() - 1 + capture = True + atomic = False + name = None + add_flags = 0 + del_flags = 0 + if sourcematch("?"): + # options + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char == "P": + # python extensions + if sourcematch("<"): + # named group: skip forward to end of name + name = source.getuntil(">", "group name") + source.checkgroupname(name, 1) + elif sourcematch("="): + # named backreference + name = source.getuntil(")", "group name") + source.checkgroupname(name, 1) + gid = state.groupdict.get(name) + if gid is None: + msg = "unknown group name %r" % name + raise source.error(msg, len(name) + 1) + if not state.checkgroup(gid): + raise source.error("cannot refer to an open group", + len(name) + 1) + state.checklookbehindgroup(gid, source) + subpatternappend((GROUPREF, gid)) + continue + + else: + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + raise source.error("unknown extension ?P" + char, + len(char) + 2) + elif char == ":": + # non-capturing group + capture = False + elif char == "#": + # comment + while True: + if source.next is None: + raise source.error("missing ), unterminated comment", + source.tell() - start) + if sourceget() == ")": + break + continue + + elif char in "=!<": + # lookahead assertions + dir = 1 + if char == "<": + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char not in "=!": + raise source.error("unknown 
extension ?<" + char, + len(char) + 2) + dir = -1 # lookbehind + lookbehindgroups = state.lookbehindgroups + if lookbehindgroups is None: + state.lookbehindgroups = state.groups + p = _parse_sub(source, state, verbose, nested + 1) + if dir < 0: + if lookbehindgroups is None: + state.lookbehindgroups = None + if not sourcematch(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if char == "=": + subpatternappend((ASSERT, (dir, p))) + elif p: + subpatternappend((ASSERT_NOT, (dir, p))) + else: + subpatternappend((FAILURE, ())) + continue + + elif char == "(": + # conditional backreference group + condname = source.getuntil(")", "group name") + if not (condname.isdecimal() and condname.isascii()): + source.checkgroupname(condname, 1) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: + condgroup = int(condname) + if not condgroup: + raise source.error("bad group number", + len(condname) + 1) + if condgroup >= MAXGROUPS: + msg = "invalid group reference %d" % condgroup + raise source.error(msg, len(condname) + 1) + if condgroup not in state.grouprefpos: + state.grouprefpos[condgroup] = ( + source.tell() - len(condname) - 1 + ) + if not (condname.isdecimal() and condname.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(condname) if source.istext else ascii(condname), + source.tell() - len(condname) - 1), + DeprecationWarning, stacklevel=nested + 6 + ) + state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose, nested + 1) + if source.match("|"): + item_no = _parse(source, state, verbose, nested + 1) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + 
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True + elif char in FLAGS or char == "-": + # flags + flags = _parse_flags(source, state, char) + if flags is None: # global flags + if not first or subpattern: + raise source.error('global flags not at the start ' + 'of the expression', + source.tell() - start) + verbose = state.flags & SRE_FLAG_VERBOSE + continue + + add_flags, del_flags = flags + capture = False + else: + raise source.error("unknown extension ?" + char, + len(char) + 1) + + # parse group contents + if capture: + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) from None + else: + group = None + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose, nested + 1) + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if group is not None: + state.closegroup(group, p) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + + elif this == "^": + subpatternappend((AT, AT_BEGINNING)) + + elif this == "$": + subpatternappend((AT, AT_END)) + + else: + raise AssertionError("unsupported special character %r" % (char,)) + + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + + return subpattern + +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise 
source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if flags & SRE_FLAG_LOCALE: + raise ValueError("cannot use LOCALE flag with a str pattern") + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + 
raise ValueError("cannot use UNICODE flag with a bytes pattern") + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: + raise ValueError("ASCII and LOCALE flags are incompatible") + return flags + +def parse(str, flags=0, state=None): + # parse 're' pattern into list of (opcode, argument) tuples + + source = Tokenizer(str) + + if state is None: + state = State() + state.flags = flags + state.str = str + + p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) + p.state.flags = fix_flags(str, p.state.flags) + + if source.next is not None: + assert source.next == ")" + raise source.error("unbalanced parenthesis") + + for g in p.state.grouprefpos: + if g >= p.state.groups: + msg = "invalid group reference %d" % g + raise error(msg, str, p.state.grouprefpos[g]) + + if flags & SRE_FLAG_DEBUG: + p.dump() + + return p + +def parse_template(source, pattern): + # parse 're' replacement string into list of literals and + # group references + s = Tokenizer(source) + sget = s.get + result = [] + literal = [] + lappend = literal.append + def addliteral(): + if s.istext: + result.append(''.join(literal)) + else: + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. 
+ result.append(''.join(literal).encode('latin-1')) + del literal[:] + def addgroup(index, pos): + if index > pattern.groups: + raise s.error("invalid group reference %d" % index, pos) + addliteral() + result.append(index) + groupindex = pattern.groupindex + while True: + this = sget() + if this is None: + break # end of replacement string + if this[0] == "\\": + # group + c = this[1] + if c == "g": + if not s.match("<"): + raise s.error("missing <") + name = s.getuntil(">", "group name") + if not (name.isdecimal() and name.isascii()): + s.checkgroupname(name, 1) + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) from None + else: + index = int(name) + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) + if not (name.isdecimal() and name.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(name) if s.istext else ascii(name), + s.tell() - len(name) - 1), + DeprecationWarning, stacklevel=5 + ) + addgroup(index, len(name) + 1) + elif c == "0": + if s.next in OCTDIGITS: + this += sget() + if s.next in OCTDIGITS: + this += sget() + lappend(chr(int(this[1:], 8) & 0xff)) + elif c in DIGITS: + isoctal = False + if s.next in DIGITS: + this += sget() + if (c in OCTDIGITS and this[2] in OCTDIGITS and + s.next in OCTDIGITS): + this += sget() + isoctal = True + c = int(this[1:], 8) + if c > 0o377: + raise s.error('octal escape value %s outside of ' + 'range 0-0o377' % this, len(this)) + lappend(chr(c)) + if not isoctal: + addgroup(int(this[1:]), len(this) - 1) + else: + try: + this = chr(ESCAPES[this][1]) + except KeyError: + if c in ASCIILETTERS: + raise s.error('bad escape %s' % this, len(this)) from None + lappend(this) + else: + lappend(this) + addliteral() + return result diff --git a/crates/weavepy-vm/src/stdlib/python/sre_compile.py b/crates/weavepy-vm/src/stdlib/python/sre_compile.py new file mode 100644 index 
0000000..41cc5bd --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/sre_compile.py @@ -0,0 +1,11 @@ +"""Internal support module for sre (deprecated alias for re._compiler).""" + +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, stacklevel=2) + +from re import _compiler +globals().update({k: v for k, v in vars(_compiler).items() + if not k.startswith('__')}) + +del warnings, _compiler diff --git a/crates/weavepy-vm/src/stdlib/python/sre_constants.py b/crates/weavepy-vm/src/stdlib/python/sre_constants.py new file mode 100644 index 0000000..b895082 --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/sre_constants.py @@ -0,0 +1,11 @@ +"""Internal support module for sre (deprecated alias for re._constants).""" + +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, stacklevel=2) + +from re import _constants +globals().update({k: v for k, v in vars(_constants).items() + if not k.startswith('__')}) + +del warnings, _constants diff --git a/crates/weavepy-vm/src/stdlib/python/sre_parse.py b/crates/weavepy-vm/src/stdlib/python/sre_parse.py new file mode 100644 index 0000000..13d9bf2 --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/python/sre_parse.py @@ -0,0 +1,11 @@ +"""Internal support module for sre (deprecated alias for re._parser).""" + +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, stacklevel=2) + +from re import _parser +globals().update({k: v for k, v in vars(_parser).items() + if not k.startswith('__')}) + +del warnings, _parser diff --git a/crates/weavepy-vm/src/stdlib/re.rs b/crates/weavepy-vm/src/stdlib/re.rs deleted file mode 100644 index d6e92cd..0000000 --- a/crates/weavepy-vm/src/stdlib/re.rs +++ /dev/null @@ -1,1089 +0,0 @@ -//! The `re` built-in module. -//! -//! Backed by Rust's `regex` crate. The user-visible API mirrors -//! CPython's `re` module for the common functions (`match`, -//! 
`search`, `findall`, `finditer`, `sub`, `split`, `compile`). -//! -//! We do not support every CPython feature: backreferences in the -//! pattern (e.g. `(?P=name)`) and lookaround (`(?=...)` / `(?<=...)`) -//! are limited by the underlying engine. The dialect is close enough -//! that the vast majority of everyday patterns work as expected. - -use crate::sync::Rc; -use crate::sync::RefCell; - -use regex::{Captures, Regex}; - -use crate::error::{type_error, value_error, RuntimeError}; -use crate::import::ModuleCache; -use crate::object::{BuiltinFn, DictData, DictKey, Object, PyModule}; -use crate::types::{PyInstance, TypeObject}; - -pub fn build(_cache: &ModuleCache) -> Rc { - let dict = Rc::new(RefCell::new(DictData::new())); - { - let mut d = dict.borrow_mut(); - d.insert( - DictKey(Object::from_static("__name__")), - Object::from_static("re"), - ); - d.insert( - DictKey(Object::from_static("__doc__")), - Object::from_static("Support for regular expressions."), - ); - d.insert(DictKey(Object::from_static("IGNORECASE")), Object::Int(2)); - d.insert(DictKey(Object::from_static("I")), Object::Int(2)); - d.insert(DictKey(Object::from_static("MULTILINE")), Object::Int(8)); - d.insert(DictKey(Object::from_static("M")), Object::Int(8)); - d.insert(DictKey(Object::from_static("DOTALL")), Object::Int(16)); - d.insert(DictKey(Object::from_static("S")), Object::Int(16)); - d.insert(DictKey(Object::from_static("VERBOSE")), Object::Int(64)); - d.insert(DictKey(Object::from_static("X")), Object::Int(64)); - d.insert(DictKey(Object::from_static("ASCII")), Object::Int(256)); - d.insert(DictKey(Object::from_static("A")), Object::Int(256)); - d.insert(DictKey(Object::from_static("match")), b("match", re_match)); - d.insert( - DictKey(Object::from_static("search")), - b("search", re_search), - ); - d.insert( - DictKey(Object::from_static("fullmatch")), - b("fullmatch", re_fullmatch), - ); - d.insert( - DictKey(Object::from_static("findall")), - b("findall", re_findall), - ); - 
d.insert( - DictKey(Object::from_static("finditer")), - b("finditer", re_finditer), - ); - d.insert(DictKey(Object::from_static("sub")), b("sub", re_sub)); - d.insert(DictKey(Object::from_static("subn")), b("subn", re_subn)); - d.insert(DictKey(Object::from_static("split")), b("split", re_split)); - d.insert( - DictKey(Object::from_static("compile")), - b("compile", re_compile), - ); - d.insert( - DictKey(Object::from_static("escape")), - b("escape", re_escape), - ); - d.insert( - DictKey(Object::from_static("error")), - Object::Type(re_error_type()), - ); - } - Rc::new(PyModule { - name: "re".to_owned(), - filename: None, - dict, - }) -} - -fn b(name: &'static str, body: fn(&[Object]) -> Result) -> Object { - Object::Builtin(Rc::new(BuiltinFn { - name, - call: Box::new(body), - call_kw: None, - })) -} - -fn re_error_type() -> Rc { - let bt = crate::builtin_types::builtin_types(); - TypeObject::new_user("error", vec![bt.value_error.clone()], DictData::new()) - .unwrap_or_else(|_| bt.value_error.clone()) -} - -/// Convert a Python regex pattern to one accepted by `regex`. We -/// rewrite the most common CPython-only shortcuts: `\A` (string -/// start) and `\Z` (string end) are kept as-is (regex supports them -/// as `\A` and `\z` respectively, but for our purposes we treat them -/// equivalently to anchors). -fn compile_pattern(pat: &str, flags: i64) -> Result { - let mut translated = pat.replace("\\Z", "\\z"); - // Python's `(?P...)` is supported by `regex` natively. - let mut builder = regex::RegexBuilder::new(&translated); - if flags & 2 != 0 { - builder.case_insensitive(true); - } - if flags & 8 != 0 { - builder.multi_line(true); - } - if flags & 16 != 0 { - builder.dot_matches_new_line(true); - } - if flags & 64 != 0 { - builder.ignore_whitespace(true); - } - // `regex` rejects some Python escapes (`\d` defaults to ASCII in - // Python 3 unless `re.UNICODE`); our build treats `\d`/`\w`/`\s` - // as Unicode-aware, matching CPython 3 defaults. 
- builder.build().or_else(|_| { - // Some patterns contain literal `(?P=name)` backrefs we can't - // support; if so, fall back to a verbose error. - translated = pat.to_owned(); - builder = regex::RegexBuilder::new(&translated); - builder - .build() - .map_err(|e| value_error(format!("invalid pattern: {e}"))) - }) -} - -/// Compile with the `fancy-regex` engine. Used as a fallback when -/// the base `regex` crate rejects the pattern — typically because -/// of CPython features `regex` doesn't implement (lookaround, -/// backreferences). Returned eagerly so callers can decide whether -/// to fall back without paying the cost on every successful -/// compile. -fn compile_pattern_fancy(pat: &str, flags: i64) -> Result { - let mut translated = pat.replace("\\Z", "\\z"); - // Apply inline flag prefix so the same CPython flag bits steer - // the fancy engine. - let mut prefix = String::new(); - if flags & 2 != 0 { - prefix.push('i'); - } - if flags & 8 != 0 { - prefix.push('m'); - } - if flags & 16 != 0 { - prefix.push('s'); - } - if flags & 64 != 0 { - prefix.push('x'); - } - if !prefix.is_empty() { - translated = format!("(?{prefix}){translated}"); - } - fancy_regex::Regex::new(&translated).map_err(|e| value_error(format!("invalid pattern: {e}"))) -} - -/// Public alias exposed to the VM dispatcher so it can route -/// callable-replacement ``re.sub`` calls itself. -pub fn extract_pattern_pub(arg: &Object) -> Result<(String, i64), RuntimeError> { - extract_pattern(arg) -} - -/// Public helper: collect every non-overlapping match span + -/// captures of ``pat`` over ``text``. Used by the VM-routed -/// ``re.sub`` callable path so the actual ``repl(match)`` calls -/// happen on the interpreter side. 
-pub fn collect_all_matches( - pat: &str, - flags: i64, - text: &str, -) -> Result>)>, RuntimeError> { - let mut out: Vec<(usize, usize, Vec>)> = Vec::new(); - let mut on_match = |s: usize, e: usize, groups: &[Option<(usize, usize)>]| { - out.push((s, e, groups.to_vec())); - }; - iter_all_matches(pat, flags, text, &mut on_match)?; - Ok(out) -} - -/// Build a ``re.Match`` object compatible with the rest of the -/// module from a pre-extracted set of group spans. -pub fn build_match_object( - pat: &str, - text: &str, - groups: &[Option<(usize, usize)>], - _full_start: usize, - _full_end: usize, -) -> Object { - let caps = DualCaptures { - groups: groups.to_vec(), - named: Vec::new(), - }; - make_match_from_captured(pat, text, &caps, text, 0) -} - -fn extract_pattern(arg: &Object) -> Result<(String, i64), RuntimeError> { - match arg { - Object::Str(s) => Ok((s.to_string(), 0)), - Object::Instance(inst) if inst.class.name == "Pattern" => { - let pat = inst - .dict - .borrow() - .get(&DictKey(Object::from_static("pattern"))) - .cloned() - .unwrap_or(Object::from_static("")); - let flags = inst - .dict - .borrow() - .get(&DictKey(Object::from_static("flags"))) - .cloned() - .unwrap_or(Object::Int(0)); - let p = match pat { - Object::Str(s) => s.to_string(), - _ => return Err(type_error("invalid Pattern object")), - }; - let f = match flags { - Object::Int(i) => i, - _ => 0, - }; - Ok((p, f)) - } - _ => Err(type_error( - "first argument must be string or compiled pattern", - )), - } -} - -thread_local! 
{ - static PATTERN_CLASS: RefCell>> = const { RefCell::new(None) }; - static MATCH_CLASS: RefCell>> = const { RefCell::new(None) }; -} - -fn pattern_class() -> Rc { - PATTERN_CLASS.with(|slot| { - if let Some(c) = slot.borrow().as_ref() { - return c.clone(); - } - let bt = crate::builtin_types::builtin_types(); - let mut dict = DictData::new(); - for (name, method) in pattern_methods() { - dict.insert(DictKey(Object::from_str(name)), method); - } - let cls = - TypeObject::new_user("Pattern", vec![bt.object_.clone()], dict).expect("Pattern type"); - *slot.borrow_mut() = Some(cls.clone()); - cls - }) -} - -fn match_class() -> Rc { - MATCH_CLASS.with(|slot| { - if let Some(c) = slot.borrow().as_ref() { - return c.clone(); - } - let bt = crate::builtin_types::builtin_types(); - let mut dict = DictData::new(); - for (name, method) in match_methods() { - dict.insert(DictKey(Object::from_str(name)), method); - } - let cls = - TypeObject::new_user("Match", vec![bt.object_.clone()], dict).expect("Match type"); - *slot.borrow_mut() = Some(cls.clone()); - cls - }) -} - -fn re_compile(args: &[Object]) -> Result { - let pat = match args.first() { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("compile() expects str pattern")), - }; - let flags = match args.get(1) { - Some(Object::Int(i)) => *i, - None => 0, - _ => return Err(type_error("flags must be int")), - }; - // Validate by compiling now; we store the source. 
- let _ = compile_pattern(&pat, flags)?; - Ok(make_pattern(pat, flags)) -} - -fn make_pattern(pattern: String, flags: i64) -> Object { - let inst = PyInstance::new(pattern_class()); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("pattern")), - Object::from_str(pattern), - ); - inst.dict - .borrow_mut() - .insert(DictKey(Object::from_static("flags")), Object::Int(flags)); - Object::Instance(Rc::new(inst)) -} - -fn pattern_methods() -> Vec<(&'static str, Object)> { - vec![ - ("match", b("match", pattern_match)), - ("search", b("search", pattern_search)), - ("fullmatch", b("fullmatch", pattern_fullmatch)), - ("findall", b("findall", pattern_findall)), - ("finditer", b("finditer", pattern_finditer)), - ("sub", b("sub", pattern_sub)), - ("split", b("split", pattern_split)), - ] -} - -fn pattern_match(args: &[Object]) -> Result { - run_match(args, true, false) -} -fn pattern_search(args: &[Object]) -> Result { - run_match(args, false, false) -} -fn pattern_fullmatch(args: &[Object]) -> Result { - run_match(args, true, true) -} -fn pattern_findall(args: &[Object]) -> Result { - re_findall(args) -} -fn pattern_finditer(args: &[Object]) -> Result { - re_finditer(args) -} -fn pattern_sub(args: &[Object]) -> Result { - re_sub(args) -} -fn pattern_split(args: &[Object]) -> Result { - re_split(args) -} - -fn re_escape(args: &[Object]) -> Result { - let s = match args.first() { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("escape() expects str")), - }; - Ok(Object::from_str(regex::escape(&s))) -} - -fn run_match( - args: &[Object], - require_start: bool, - fullmatch: bool, -) -> Result { - let first = args - .first() - .ok_or_else(|| type_error("expected pattern argument"))?; - let from_pattern = matches!(first, Object::Instance(inst) if inst.class.name == "Pattern"); - let (pat, default_flags) = extract_pattern(first)?; - let text = match args.get(1) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("expected str 
input")), - }; - // Pattern method form: `pattern.match(s, pos=0, endpos=len(s))`. - // Module-level form: `re.match(pattern, s, flags=0)`. - let (flags, pos, endpos) = if from_pattern { - let pos = match args.get(2) { - Some(Object::Int(i)) => *i, - _ => 0, - }; - let endpos = match args.get(3) { - Some(Object::Int(i)) => *i, - _ => text.chars().count() as i64, - }; - (default_flags, pos, endpos) - } else { - let flags = match args.get(2) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - (flags, 0i64, text.chars().count() as i64) - }; - let start_byte = char_index_to_byte(&text, pos.max(0) as usize); - let end_byte = char_index_to_byte(&text, endpos.max(0) as usize); - if start_byte > end_byte || start_byte > text.len() { - return Ok(Object::None); - } - let slice_end = end_byte.min(text.len()); - let slice = &text[start_byte..slice_end]; - let captured = match dual_captures(&pat, flags, slice)? { - Some(c) => c, - None => return Ok(Object::None), - }; - let span0 = captured.groups[0].expect("group 0 always present"); - if require_start && span0.0 != 0 { - return Ok(Object::None); - } - if fullmatch && (span0.0 != 0 || span0.1 != slice.len()) { - return Ok(Object::None); - } - Ok(make_match_from_captured( - &pat, &text, &captured, slice, start_byte, - )) -} - -/// A capture result that hides which engine produced it. Spans are -/// byte offsets into the *slice* the caller passed; the caller adds -/// any base offset back. -struct DualCaptures { - groups: Vec>, - /// Ordered ``(name, Option)`` pairs for named groups. - /// Group indices line up with ``groups``. 
- named: Vec<(String, usize)>, -} - -fn dual_captures(pat: &str, flags: i64, slice: &str) -> Result, RuntimeError> { - if let Ok(re) = compile_pattern(pat, flags) { - if let Some(caps) = re.captures(slice) { - let mut groups = Vec::with_capacity(caps.len()); - for i in 0..caps.len() { - groups.push(caps.get(i).map(|m| (m.start(), m.end()))); - } - let mut named = Vec::new(); - for (i, name) in re.capture_names().enumerate() { - if let Some(n) = name { - named.push((n.to_owned(), i)); - } - } - return Ok(Some(DualCaptures { groups, named })); - } - return Ok(None); - } - // Fallback to fancy-regex. - let re = compile_pattern_fancy(pat, flags)?; - let cap = re - .captures(slice) - .map_err(|e| value_error(format!("regex error: {e}")))?; - let caps = match cap { - Some(c) => c, - None => return Ok(None), - }; - let mut groups = Vec::with_capacity(caps.len()); - for i in 0..caps.len() { - groups.push(caps.get(i).map(|m| (m.start(), m.end()))); - } - let mut named = Vec::new(); - for (i, name) in re.capture_names().enumerate() { - if let Some(n) = name { - named.push((n.to_owned(), i)); - } - } - Ok(Some(DualCaptures { groups, named })) -} - -fn make_match_from_captured( - pat: &str, - text: &str, - caps: &DualCaptures, - slice: &str, - base_offset: usize, -) -> Object { - let inst = PyInstance::new(match_class()); - let span0 = caps.groups[0].expect("group 0 always present"); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("string")), - Object::from_str(text.to_owned()), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("re")), - Object::from_str(pat.to_owned()), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("pos")), - Object::Int(base_offset as i64), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("endpos")), - Object::Int(text.len() as i64), - ); - let mut groups: Vec = Vec::new(); - let mut spans: Vec = Vec::new(); - for span in &caps.groups { - match span { - Some((s, e)) => { - 
groups.push(Object::from_str(slice[*s..*e].to_owned())); - spans.push(Object::new_tuple(vec![ - Object::Int((s + base_offset) as i64), - Object::Int((e + base_offset) as i64), - ])); - } - None => { - groups.push(Object::None); - spans.push(Object::new_tuple(vec![Object::Int(-1), Object::Int(-1)])); - } - } - } - let mut named_dict = DictData::new(); - for (name, idx) in &caps.named { - let val = match caps.groups.get(*idx).copied().flatten() { - Some((s, e)) => Object::from_str(slice[s..e].to_owned()), - None => Object::None, - }; - named_dict.insert(DictKey(Object::from_str(name.clone())), val); - } - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_groups")), - Object::new_tuple(groups), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_spans")), - Object::new_tuple(spans), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_named")), - Object::Dict(Rc::new(RefCell::new(named_dict))), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_full_start")), - Object::Int((span0.0 + base_offset) as i64), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_full_end")), - Object::Int((span0.1 + base_offset) as i64), - ); - Object::Instance(Rc::new(inst)) -} - -fn char_index_to_byte(s: &str, n: usize) -> usize { - for (count, (i, _)) in s.char_indices().enumerate() { - if count == n { - return i; - } - } - s.len() -} - -fn re_match(args: &[Object]) -> Result { - run_match(args, true, false) -} - -fn re_search(args: &[Object]) -> Result { - run_match(args, false, false) -} - -fn re_fullmatch(args: &[Object]) -> Result { - run_match(args, true, true) -} - -#[allow(dead_code)] -fn make_match( - pat: &str, - text: &str, - caps: &Captures<'_>, - re: &Regex, - base_offset: usize, -) -> Object { - let inst = PyInstance::new(match_class()); - let m0 = caps.get(0).expect("at least one capture"); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("string")), - 
Object::from_str(text.to_owned()), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("re")), - Object::from_str(pat.to_owned()), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("pos")), - Object::Int(base_offset as i64), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("endpos")), - Object::Int(text.len() as i64), - ); - let mut groups: Vec = Vec::new(); - for i in 0..caps.len() { - match caps.get(i) { - Some(m) => groups.push(Object::from_str(m.as_str().to_owned())), - None => groups.push(Object::None), - } - } - let mut spans: Vec = Vec::new(); - for i in 0..caps.len() { - match caps.get(i) { - Some(m) => spans.push(Object::new_tuple(vec![ - Object::Int((m.start() + base_offset) as i64), - Object::Int((m.end() + base_offset) as i64), - ])), - None => spans.push(Object::new_tuple(vec![Object::Int(-1), Object::Int(-1)])), - } - } - let mut named = DictData::new(); - for name in re.capture_names().flatten() { - let val = caps - .name(name) - .map(|m| Object::from_str(m.as_str().to_owned())) - .unwrap_or(Object::None); - named.insert(DictKey(Object::from_str(name.to_owned())), val); - } - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_groups")), - Object::new_tuple(groups), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_spans")), - Object::new_tuple(spans), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_named")), - Object::Dict(Rc::new(RefCell::new(named))), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_full_start")), - Object::Int((m0.start() + base_offset) as i64), - ); - inst.dict.borrow_mut().insert( - DictKey(Object::from_static("_full_end")), - Object::Int((m0.end() + base_offset) as i64), - ); - Object::Instance(Rc::new(inst)) -} - -fn match_methods() -> Vec<(&'static str, Object)> { - vec![ - ("group", b("group", match_group)), - ("groups", b("groups", match_groups_method)), - ("groupdict", b("groupdict", 
match_groupdict)), - ("start", b("start", match_start)), - ("end", b("end", match_end)), - ("span", b("span", match_span)), - ] -} - -fn match_self(args: &[Object]) -> Result, RuntimeError> { - match args.first() { - Some(Object::Instance(i)) if i.class.name == "Match" => Ok(i.clone()), - _ => Err(type_error("expected Match receiver")), - } -} - -fn match_group(args: &[Object]) -> Result { - let m = match_self(args)?; - let groups = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_groups"))) - .cloned(); - let named = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_named"))) - .cloned(); - let groups_tuple = match groups { - Some(Object::Tuple(t)) => t, - _ => return Err(type_error("invalid Match groups")), - }; - let lookup = |idx: &Object| -> Result { - match idx { - Object::Int(i) => groups_tuple - .get(*i as usize) - .cloned() - .ok_or_else(|| value_error("no such group")), - Object::Str(s) => match named { - Some(Object::Dict(ref d)) => d - .borrow() - .get(&DictKey(Object::from_str(s.to_string()))) - .cloned() - .ok_or_else(|| value_error("no such group")), - _ => Err(value_error("no named groups")), - }, - _ => Err(type_error("group key must be int or str")), - } - }; - let arg_indices = &args[1..]; - if arg_indices.is_empty() { - return Ok(groups_tuple.first().cloned().unwrap_or(Object::None)); - } - if arg_indices.len() == 1 { - return lookup(&arg_indices[0]); - } - let mut out = Vec::with_capacity(arg_indices.len()); - for a in arg_indices { - out.push(lookup(a)?); - } - Ok(Object::new_tuple(out)) -} - -fn match_groups_method(args: &[Object]) -> Result { - let m = match_self(args)?; - let groups = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_groups"))) - .cloned(); - let default = args.get(1).cloned().unwrap_or(Object::None); - match groups { - Some(Object::Tuple(t)) => { - let out: Vec = t - .iter() - .skip(1) - .cloned() - .map(|v| { - if matches!(v, Object::None) { - default.clone() - } else { - v - } - }) - 
.collect(); - Ok(Object::new_tuple(out)) - } - _ => Err(type_error("invalid Match groups")), - } -} - -fn match_groupdict(args: &[Object]) -> Result { - let m = match_self(args)?; - let named = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_named"))) - .cloned(); - match named { - Some(Object::Dict(d)) => Ok(Object::Dict(d.clone())), - _ => Ok(Object::new_dict()), - } -} - -fn match_start(args: &[Object]) -> Result { - let m = match_self(args)?; - let idx = args.get(1).cloned().unwrap_or(Object::Int(0)); - let i = match idx { - Object::Int(i) => i, - _ => return Err(type_error("start() expected int")), - }; - let spans = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_spans"))) - .cloned(); - match spans { - Some(Object::Tuple(spans)) => match spans.get(i as usize) { - Some(Object::Tuple(t)) => Ok(t[0].clone()), - _ => Err(value_error("no such group")), - }, - _ => Err(type_error("invalid Match spans")), - } -} - -fn match_end(args: &[Object]) -> Result { - let m = match_self(args)?; - let idx = args.get(1).cloned().unwrap_or(Object::Int(0)); - let i = match idx { - Object::Int(i) => i, - _ => return Err(type_error("end() expected int")), - }; - let spans = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_spans"))) - .cloned(); - match spans { - Some(Object::Tuple(spans)) => match spans.get(i as usize) { - Some(Object::Tuple(t)) => Ok(t[1].clone()), - _ => Err(value_error("no such group")), - }, - _ => Err(type_error("invalid Match spans")), - } -} - -fn match_span(args: &[Object]) -> Result { - let m = match_self(args)?; - let idx = args.get(1).cloned().unwrap_or(Object::Int(0)); - let i = match idx { - Object::Int(i) => i, - _ => return Err(type_error("span() expected int")), - }; - let spans = m - .dict - .borrow() - .get(&DictKey(Object::from_static("_spans"))) - .cloned(); - match spans { - Some(Object::Tuple(spans)) => spans - .get(i as usize) - .cloned() - .ok_or_else(|| value_error("no such group")), - _ => 
Err(type_error("invalid Match spans")), - } -} - -fn re_findall(args: &[Object]) -> Result { - let (pat, default_flags) = - extract_pattern(args.first().ok_or_else(|| type_error("expected pattern"))?)?; - let text = match args.get(1) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("expected str")), - }; - let flags = match args.get(2) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - let mut out = Vec::new(); - let mut on_match = |_s: usize, _e: usize, groups: &[Option<(usize, usize)>]| { - let has_groups = groups.len() > 1; - if has_groups { - let group_count = groups.len() - 1; - if group_count == 1 { - let s = groups[1].map_or(String::new(), |(s, e)| text[s..e].to_owned()); - out.push(Object::from_str(s)); - } else { - let mut tup = Vec::with_capacity(group_count); - for g in groups.iter().skip(1).take(group_count) { - let s = g.map_or(String::new(), |(s, e)| text[s..e].to_owned()); - tup.push(Object::from_str(s)); - } - out.push(Object::new_tuple(tup)); - } - } else { - let s = groups[0].map_or(String::new(), |(s, e)| text[s..e].to_owned()); - out.push(Object::from_str(s)); - } - }; - iter_all_matches(&pat, flags, &text, &mut on_match)?; - Ok(Object::new_list(out)) -} - -fn re_finditer(args: &[Object]) -> Result { - let (pat, default_flags) = - extract_pattern(args.first().ok_or_else(|| type_error("expected pattern"))?)?; - let text = match args.get(1) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("expected str")), - }; - let flags = match args.get(2) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - let mut out = Vec::new(); - let mut consume_groups = |start: usize, _end: usize, groups: &[Option<(usize, usize)>]| { - let groups_vec = groups.to_vec(); - let _ = start; - let caps = DualCaptures { - groups: groups_vec, - named: Vec::new(), - }; - out.push(make_match_from_captured(&pat, &text, &caps, &text, 0)); - }; - iter_all_matches(&pat, flags, &text, &mut consume_groups)?; - 
Ok(Object::new_list(out)) -} - -/// Walk every non-overlapping match in ``text`` and invoke ``f`` -/// with the byte span and capture groups. Falls back to the -/// ``fancy_regex`` engine if the base ``regex`` can't compile the -/// pattern. -fn iter_all_matches( - pat: &str, - flags: i64, - text: &str, - f: &mut dyn FnMut(usize, usize, &[Option<(usize, usize)>]), -) -> Result<(), RuntimeError> { - match compile_pattern(pat, flags) { - Ok(re) => { - for caps in re.captures_iter(text) { - let mut groups = Vec::with_capacity(caps.len()); - for i in 0..caps.len() { - groups.push(caps.get(i).map(|m| (m.start(), m.end()))); - } - let m = caps.get(0).unwrap(); - f(m.start(), m.end(), &groups); - } - Ok(()) - } - Err(_) => { - let re = compile_pattern_fancy(pat, flags)?; - for caps in re.captures_iter(text) { - let caps = caps.map_err(|e| value_error(format!("regex error: {e}")))?; - let mut groups = Vec::with_capacity(caps.len()); - for i in 0..caps.len() { - groups.push(caps.get(i).map(|m| (m.start(), m.end()))); - } - let m = caps.get(0).unwrap(); - f(m.start(), m.end(), &groups); - } - Ok(()) - } - } -} - -fn re_sub(args: &[Object]) -> Result { - let (s, _) = re_sub_impl(args)?; - Ok(Object::from_str(s)) -} - -fn re_subn(args: &[Object]) -> Result { - let (s, n) = re_sub_impl(args)?; - Ok(Object::new_tuple(vec![Object::from_str(s), Object::Int(n)])) -} - -fn re_sub_impl(args: &[Object]) -> Result<(String, i64), RuntimeError> { - let (pat, default_flags) = - extract_pattern(args.first().ok_or_else(|| type_error("expected pattern"))?)?; - let repl = match args.get(1) { - Some(Object::Str(s)) => s.to_string(), - Some(Object::Function(_)) | Some(Object::Builtin(_)) | Some(Object::BoundMethod(_)) => { - // ``re.sub`` with a callable replacement requires - // calling back into the VM. The VM intercepts the - // ``sub`` builtin (see ``do_re_sub_call`` in - // ``lib.rs``) and routes those calls itself, so the - // pure-data path here only services the string form. 
- return Err(type_error( - "internal: callable re.sub should be handled at the VM dispatch layer", - )); - } - _ => return Err(type_error("repl must be str or callable")), - }; - let text = match args.get(2) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("expected str")), - }; - let count = match args.get(3) { - Some(Object::Int(i)) => *i, - _ => 0, - }; - let flags = match args.get(4) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - let mut out = String::new(); - let mut last_end = 0usize; - let mut replacements = 0i64; - let mut on_match = |s: usize, e: usize, groups: &[Option<(usize, usize)>]| { - if count > 0 && replacements >= count { - return; - } - out.push_str(&text[last_end..s]); - out.push_str(&expand_replacement_from_groups(&repl, groups, &text)); - last_end = e; - replacements += 1; - }; - iter_all_matches(&pat, flags, &text, &mut on_match)?; - out.push_str(&text[last_end..]); - Ok((out, replacements)) -} - -/// Same expansion rules as ``expand_replacement`` but driven by -/// pre-extracted group spans rather than a regex ``Captures``. -fn expand_replacement_from_groups( - repl: &str, - groups: &[Option<(usize, usize)>], - text: &str, -) -> String { - let mut out = String::new(); - let bytes = repl.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - let next = bytes[i + 1]; - if next.is_ascii_digit() { - let idx = (next - b'0') as usize; - if let Some(Some((s, e))) = groups.get(idx).copied() { - out.push_str(&text[s..e]); - } - i += 2; - } else if next == b'g' && i + 2 < bytes.len() && bytes[i + 2] == b'<' { - let close = bytes[i + 3..] 
- .iter() - .position(|b| *b == b'>') - .map(|p| i + 3 + p); - if let Some(end) = close { - let name = &repl[i + 3..end]; - if let Ok(n) = name.parse::() { - if let Some(Some((s, e))) = groups.get(n).copied() { - out.push_str(&text[s..e]); - } - } - i = end + 1; - continue; - } - out.push('\\'); - i += 1; - } else if next == b'n' { - out.push('\n'); - i += 2; - } else if next == b't' { - out.push('\t'); - i += 2; - } else if next == b'\\' { - out.push('\\'); - i += 2; - } else { - out.push('\\'); - out.push(next as char); - i += 2; - } - } else { - out.push(bytes[i] as char); - i += 1; - } - } - out -} - -/// Expand `\1` / `\g` etc. inside a `re.sub` replacement. -#[allow(dead_code)] -fn expand_replacement(repl: &str, caps: &Captures<'_>) -> String { - let mut out = String::new(); - let bytes = repl.as_bytes(); - let mut i = 0; - while i < bytes.len() { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - let next = bytes[i + 1]; - if next.is_ascii_digit() { - let idx = (next - b'0') as usize; - if let Some(m) = caps.get(idx) { - out.push_str(m.as_str()); - } - i += 2; - } else if next == b'g' && i + 2 < bytes.len() && bytes[i + 2] == b'<' { - let close = bytes[i + 3..] 
- .iter() - .position(|b| *b == b'>') - .map(|p| i + 3 + p); - if let Some(end) = close { - let name = &repl[i + 3..end]; - if let Ok(n) = name.parse::() { - if let Some(m) = caps.get(n) { - out.push_str(m.as_str()); - } - } else if let Some(m) = caps.name(name) { - out.push_str(m.as_str()); - } - i = end + 1; - continue; - } - out.push('\\'); - i += 1; - } else if next == b'n' { - out.push('\n'); - i += 2; - } else if next == b't' { - out.push('\t'); - i += 2; - } else if next == b'\\' { - out.push('\\'); - i += 2; - } else { - out.push('\\'); - out.push(next as char); - i += 2; - } - } else { - let ch_len = if bytes[i] < 0x80 { 1 } else { 1 }; - out.push(bytes[i] as char); - i += ch_len; - } - } - out -} - -fn re_split(args: &[Object]) -> Result { - let (pat, default_flags) = - extract_pattern(args.first().ok_or_else(|| type_error("expected pattern"))?)?; - let text = match args.get(1) { - Some(Object::Str(s)) => s.to_string(), - _ => return Err(type_error("expected str")), - }; - let maxsplit = match args.get(2) { - Some(Object::Int(i)) => *i, - _ => 0, - }; - let flags = match args.get(3) { - Some(Object::Int(i)) => *i, - _ => default_flags, - }; - let re = compile_pattern(&pat, flags)?; - let mut out = Vec::new(); - let mut last_end = 0; - for (splits, caps) in re.captures_iter(&text).enumerate() { - if maxsplit > 0 && splits as i64 >= maxsplit { - break; - } - let m = caps.get(0).expect("capture 0"); - out.push(Object::from_str(text[last_end..m.start()].to_owned())); - // Include captured groups as separate output elements (Python - // semantics). 
- for i in 1..caps.len() { - match caps.get(i) { - Some(g) => out.push(Object::from_str(g.as_str().to_owned())), - None => out.push(Object::None), - } - } - last_end = m.end(); - } - out.push(Object::from_str(text[last_end..].to_owned())); - Ok(Object::new_list(out)) -} diff --git a/crates/weavepy-vm/src/stdlib/sre_mod.rs b/crates/weavepy-vm/src/stdlib/sre_mod.rs new file mode 100644 index 0000000..deb821a --- /dev/null +++ b/crates/weavepy-vm/src/stdlib/sre_mod.rs @@ -0,0 +1,1557 @@ +//! The native `_sre` module: WeavePy's faithful port of CPython's +//! secret-labs regular-expression engine (RFC 0035). +//! +//! This is a direct, line-for-line translation of the backtracking +//! matcher in CPython 3.13's `Modules/_sre/sre_lib.h` (the `SRE(match)` +//! / `SRE(count)` / `SRE(charset)` / `SRE(search)` templated +//! functions). It consumes the exact same compiled int-code emitted by +//! the frozen Python `re._compiler`, so behaviour — including +//! lookaround, backreferences, atomic groups, possessive quantifiers, +//! conditional groups and the precise greedy/lazy backtracking order — +//! matches CPython. +//! +//! The public Python surface (`Pattern` / `Match` objects, `sub`, +//! `split`, `finditer`, …) lives in the frozen `re` package; this +//! module only exposes the primitive matching core plus the +//! case-folding helpers the compiler needs. +//! +//! Strings are matched over code-point arrays, so every position +//! returned (group spans, `pos`, `endpos`) is a Python code-point +//! index, exactly like CPython. Byte patterns are matched over the raw +//! byte values (each byte widened to a `u32`). 
+ +use crate::error::{runtime_error, type_error, value_error, RuntimeError}; +use crate::import::ModuleCache; +use crate::object::{BuiltinFn, DictData, DictKey, Object, PyModule}; +use crate::sync::Rc; +use crate::sync::RefCell; + +// --------------------------------------------------------------------------- +// Constants (mirrors re/_constants.py and sre_constants.h) +// --------------------------------------------------------------------------- + +pub const MAGIC: i64 = 20_230_612; +pub const CODESIZE: i64 = 4; +/// `SRE_MAXREPEAT` — the "unlimited" sentinel for `{m,}` style repeats. +const MAXREPEAT: u32 = 4_294_967_295; +const MAXREPEAT_I64: i64 = 4_294_967_295; +/// `SRE_MAXGROUPS`. +const MAXGROUPS: i64 = 2_147_483_647 / 2; + +// Opcodes — indices into re/_constants.py OPCODES (after trimming +// MIN_REPEAT / MAX_REPEAT, which never reach the compiled code). +const OP_FAILURE: u32 = 0; +const OP_SUCCESS: u32 = 1; +const OP_ANY: u32 = 2; +const OP_ANY_ALL: u32 = 3; +const OP_ASSERT: u32 = 4; +const OP_ASSERT_NOT: u32 = 5; +const OP_AT: u32 = 6; +const OP_BRANCH: u32 = 7; +const OP_CATEGORY: u32 = 8; +const OP_CHARSET: u32 = 9; +const OP_BIGCHARSET: u32 = 10; +const OP_GROUPREF: u32 = 11; +const OP_GROUPREF_EXISTS: u32 = 12; +const OP_IN: u32 = 13; +const OP_INFO: u32 = 14; +const OP_JUMP: u32 = 15; +const OP_LITERAL: u32 = 16; +const OP_MARK: u32 = 17; +const OP_MAX_UNTIL: u32 = 18; +const OP_MIN_UNTIL: u32 = 19; +const OP_NOT_LITERAL: u32 = 20; +const OP_NEGATE: u32 = 21; +const OP_RANGE: u32 = 22; +const OP_REPEAT: u32 = 23; +const OP_REPEAT_ONE: u32 = 24; +#[allow(dead_code)] // appears only in parser output, never in compiled code +const OP_SUBPATTERN: u32 = 25; +const OP_MIN_REPEAT_ONE: u32 = 26; +const OP_ATOMIC_GROUP: u32 = 27; +const OP_POSSESSIVE_REPEAT: u32 = 28; +const OP_POSSESSIVE_REPEAT_ONE: u32 = 29; +const OP_GROUPREF_IGNORE: u32 = 30; +const OP_IN_IGNORE: u32 = 31; +const OP_LITERAL_IGNORE: u32 = 32; +const OP_NOT_LITERAL_IGNORE: u32 = 33; 
+const OP_GROUPREF_LOC_IGNORE: u32 = 34; +const OP_IN_LOC_IGNORE: u32 = 35; +const OP_LITERAL_LOC_IGNORE: u32 = 36; +const OP_NOT_LITERAL_LOC_IGNORE: u32 = 37; +const OP_GROUPREF_UNI_IGNORE: u32 = 38; +const OP_IN_UNI_IGNORE: u32 = 39; +const OP_LITERAL_UNI_IGNORE: u32 = 40; +const OP_NOT_LITERAL_UNI_IGNORE: u32 = 41; +const OP_RANGE_UNI_IGNORE: u32 = 42; + +// AT codes. +const AT_BEGINNING: u32 = 0; +const AT_BEGINNING_LINE: u32 = 1; +const AT_BEGINNING_STRING: u32 = 2; +const AT_BOUNDARY: u32 = 3; +const AT_NON_BOUNDARY: u32 = 4; +const AT_END: u32 = 5; +const AT_END_LINE: u32 = 6; +const AT_END_STRING: u32 = 7; +const AT_LOC_BOUNDARY: u32 = 8; +const AT_LOC_NON_BOUNDARY: u32 = 9; +const AT_UNI_BOUNDARY: u32 = 10; +const AT_UNI_NON_BOUNDARY: u32 = 11; + +// Category codes. +const CAT_DIGIT: u32 = 0; +const CAT_NOT_DIGIT: u32 = 1; +const CAT_SPACE: u32 = 2; +const CAT_NOT_SPACE: u32 = 3; +const CAT_WORD: u32 = 4; +const CAT_NOT_WORD: u32 = 5; +const CAT_LINEBREAK: u32 = 6; +const CAT_NOT_LINEBREAK: u32 = 7; +const CAT_LOC_WORD: u32 = 8; +const CAT_LOC_NOT_WORD: u32 = 9; +const CAT_UNI_DIGIT: u32 = 10; +const CAT_UNI_NOT_DIGIT: u32 = 11; +const CAT_UNI_SPACE: u32 = 12; +const CAT_UNI_NOT_SPACE: u32 = 13; +const CAT_UNI_WORD: u32 = 14; +const CAT_UNI_NOT_WORD: u32 = 15; +const CAT_UNI_LINEBREAK: u32 = 16; +const CAT_UNI_NOT_LINEBREAK: u32 = 17; + +/// Guards against unbounded native recursion on pathological patterns +/// (CPython uses a heap-allocated context stack; we recurse on the +/// Rust stack and bail out with an error rather than crash). +const MAX_DEPTH: u32 = 10_000; + +// --------------------------------------------------------------------------- +// Compiled-pattern registry +// --------------------------------------------------------------------------- + +struct CompiledCode { + code: Vec, + groups: usize, +} + +thread_local! 
{ + static REGISTRY: RefCell>> = const { RefCell::new(Vec::new()) }; +} + +// --------------------------------------------------------------------------- +// Case-folding helpers +// --------------------------------------------------------------------------- + +#[inline] +fn lower_ascii(ch: u32) -> u32 { + if (u32::from(b'A')..=u32::from(b'Z')).contains(&ch) { + ch + 32 + } else { + ch + } +} + +#[inline] +fn upper_ascii(ch: u32) -> u32 { + if (u32::from(b'a')..=u32::from(b'z')).contains(&ch) { + ch - 32 + } else { + ch + } +} + +fn lower_unicode(ch: u32) -> u32 { + match char::from_u32(ch) { + Some(c) => c.to_lowercase().next().map_or(ch, |c| c as u32), + None => ch, + } +} + +fn upper_unicode(ch: u32) -> u32 { + match char::from_u32(ch) { + Some(c) => c.to_uppercase().next().map_or(ch, |c| c as u32), + None => ch, + } +} + +// We approximate locale case folding with ASCII (CPython's behaviour is +// locale-dependent and LOCALE tests are largely skipped). +#[inline] +fn lower_locale(ch: u32) -> u32 { + lower_ascii(ch) +} +#[inline] +fn upper_locale(ch: u32) -> u32 { + upper_ascii(ch) +} + +#[inline] +fn char_loc_ignore(pat: u32, ch: u32) -> bool { + ch == pat || lower_locale(ch) == pat || upper_locale(ch) == pat +} + +fn unicode_iscased(ch: u32) -> bool { + let lo = lower_unicode(ch); + let up = upper_unicode(ch); + ch != lo || ch != up +} + +fn ascii_iscased(ch: u32) -> bool { + (u32::from(b'a')..=u32::from(b'z')).contains(&ch) || (u32::from(b'A')..=u32::from(b'Z')).contains(&ch) +} + +// --------------------------------------------------------------------------- +// Character classification (mirrors the SRE_IS_* / SRE_UNI_IS_* macros) +// --------------------------------------------------------------------------- + +#[inline] +fn is_linebreak(ch: u32) -> bool { + ch == u32::from(b'\n') +} + +#[inline] +fn ascii_digit(ch: u32) -> bool { + ch < 128 && (u32::from(b'0')..=u32::from(b'9')).contains(&ch) +} + +#[inline] +fn ascii_space(ch: u32) -> bool { + // ' ', \t, 
\n, \r, \v, \f + ch < 128 && matches!(ch, 0x20 | 0x09 | 0x0a | 0x0b | 0x0c | 0x0d) +} + +#[inline] +fn ascii_word(ch: u32) -> bool { + ch < 128 + && (ascii_digit(ch) + || (u32::from(b'a')..=u32::from(b'z')).contains(&ch) + || (u32::from(b'A')..=u32::from(b'Z')).contains(&ch) + || ch == u32::from(b'_')) +} + +#[inline] +fn loc_word(ch: u32) -> bool { + // Latin-1 alphanumeric or underscore. + if ch == u32::from(b'_') { + return true; + } + match char::from_u32(ch) { + Some(c) => ch < 256 && (c.is_alphanumeric()), + None => false, + } +} + +fn uni_digit(ch: u32) -> bool { + match char::from_u32(ch) { + // Py_UNICODE_ISDECIMAL — decimal digits: ASCII `0`-`9` plus the + // Unicode Decimal_Number (Nd) category for non-ASCII scripts. + Some(c) => c.is_ascii_digit() || nd_digit(c), + None => false, + } +} + +/// Best-effort Unicode decimal-digit (general category Nd) test for the +/// common non-ASCII blocks, so `\d` matches like CPython without a full +/// Unicode database. +fn nd_digit(c: char) -> bool { + let v = c as u32; + matches!(v, + 0x0660..=0x0669 // Arabic-Indic + | 0x06F0..=0x06F9 // Extended Arabic-Indic + | 0x07C0..=0x07C9 // NKo + | 0x0966..=0x096F // Devanagari + | 0x09E6..=0x09EF // Bengali + | 0x0A66..=0x0A6F // Gurmukhi + | 0x0AE6..=0x0AEF // Gujarati + | 0x0B66..=0x0B6F // Oriya + | 0x0BE6..=0x0BEF // Tamil + | 0x0C66..=0x0C6F // Telugu + | 0x0CE6..=0x0CEF // Kannada + | 0x0D66..=0x0D6F // Malayalam + | 0x0E50..=0x0E59 // Thai + | 0x0ED0..=0x0ED9 // Lao + | 0x0F20..=0x0F29 // Tibetan + | 0xFF10..=0xFF19 // Fullwidth + ) +} + +fn uni_space(ch: u32) -> bool { + match char::from_u32(ch) { + Some(c) => c.is_whitespace(), + None => false, + } +} + +fn uni_word(ch: u32) -> bool { + if ch == u32::from(b'_') { + return true; + } + match char::from_u32(ch) { + Some(c) => c.is_alphanumeric(), + None => false, + } +} + +fn uni_linebreak(ch: u32) -> bool { + matches!( + ch, + 0x0a | 0x0b | 0x0c | 0x0d | 0x1c | 0x1d | 0x1e | 0x85 | 0x2028 | 0x2029 + ) +} + +fn 
category(chcode: u32, ch: u32) -> bool { + match chcode { + CAT_DIGIT => ascii_digit(ch), + CAT_NOT_DIGIT => !ascii_digit(ch), + CAT_SPACE => ascii_space(ch), + CAT_NOT_SPACE => !ascii_space(ch), + CAT_WORD => ascii_word(ch), + CAT_NOT_WORD => !ascii_word(ch), + CAT_LINEBREAK => is_linebreak(ch), + CAT_NOT_LINEBREAK => !is_linebreak(ch), + CAT_LOC_WORD => loc_word(ch), + CAT_LOC_NOT_WORD => !loc_word(ch), + CAT_UNI_DIGIT => uni_digit(ch), + CAT_UNI_NOT_DIGIT => !uni_digit(ch), + CAT_UNI_SPACE => uni_space(ch), + CAT_UNI_NOT_SPACE => !uni_space(ch), + CAT_UNI_WORD => uni_word(ch), + CAT_UNI_NOT_WORD => !uni_word(ch), + CAT_UNI_LINEBREAK => uni_linebreak(ch), + CAT_UNI_NOT_LINEBREAK => !uni_linebreak(ch), + _ => false, + } +} + +// --------------------------------------------------------------------------- +// The matcher +// --------------------------------------------------------------------------- + +#[derive(Clone)] +struct MarkSnapshot { + marks: Vec, + lastmark: isize, + lastindex: isize, +} + +struct RepeatCtx { + count: isize, + /// Index in `code` of the REPEAT op's first argument (skip slot). 
+ pattern: usize, + last_ptr: isize, + prev: Option, +} + +struct Matcher<'a> { + s: &'a [u32], + code: &'a [u32], + beginning: usize, + start: usize, + end: usize, + ptr: usize, + marks: Vec, + lastmark: isize, + lastindex: isize, + must_advance: bool, + match_all: bool, + repeats: Vec, + cur_repeat: Option, + depth: u32, +} + +impl<'a> Matcher<'a> { + fn new(s: &'a [u32], code: &'a [u32], groups: usize) -> Self { + Matcher { + s, + code, + beginning: 0, + start: 0, + end: s.len(), + ptr: 0, + marks: vec![-1; groups * 2], + lastmark: -1, + lastindex: -1, + must_advance: false, + match_all: false, + repeats: Vec::new(), + cur_repeat: None, + depth: 0, + } + } + + fn reset_capture(&mut self) { + for m in self.marks.iter_mut() { + *m = -1; + } + self.lastmark = -1; + self.lastindex = -1; + } + + #[inline] + fn snapshot(&self) -> MarkSnapshot { + MarkSnapshot { + marks: self.marks.clone(), + lastmark: self.lastmark, + lastindex: self.lastindex, + } + } + + #[inline] + fn restore(&mut self, snap: &MarkSnapshot) { + self.marks.clone_from(&snap.marks); + self.lastmark = snap.lastmark; + self.lastindex = snap.lastindex; + } + + fn at(&self, ptr: usize, atcode: u32) -> bool { + let s = self.s; + match atcode { + AT_BEGINNING | AT_BEGINNING_STRING => ptr == self.beginning, + AT_BEGINNING_LINE => ptr == self.beginning || is_linebreak(s[ptr - 1]), + AT_END => (self.end - ptr == 1 && is_linebreak(s[ptr])) || ptr == self.end, + AT_END_LINE => ptr == self.end || is_linebreak(s[ptr]), + AT_END_STRING => ptr == self.end, + AT_BOUNDARY => self.word_boundary(ptr, ascii_word), + AT_NON_BOUNDARY => !self.word_boundary(ptr, ascii_word), + AT_LOC_BOUNDARY => self.word_boundary(ptr, loc_word), + AT_LOC_NON_BOUNDARY => !self.word_boundary(ptr, loc_word), + AT_UNI_BOUNDARY => self.word_boundary(ptr, uni_word), + AT_UNI_NON_BOUNDARY => !self.word_boundary(ptr, uni_word), + _ => false, + } + } + + #[inline] + fn word_boundary(&self, ptr: usize, is_word: fn(u32) -> bool) -> bool { + if 
self.beginning == self.end { + return false; + } + let thatp = ptr > self.beginning && is_word(self.s[ptr - 1]); + let thisp = ptr < self.end && is_word(self.s[ptr]); + thisp != thatp + } + + /// `SRE(charset)` — is `ch` a member of the set starting at `set`? + fn charset(&self, mut set: usize, ch: u32) -> bool { + let code = self.code; + let mut ok = true; + loop { + let op = code[set]; + set += 1; + match op { + OP_FAILURE => return !ok, + OP_LITERAL => { + if ch == code[set] { + return ok; + } + set += 1; + } + OP_CATEGORY => { + if category(code[set], ch) { + return ok; + } + set += 1; + } + OP_CHARSET => { + // + if ch < 256 && (code[set + (ch / 32) as usize] & (1u32 << (ch & 31))) != 0 { + return ok; + } + set += 8; + } + OP_RANGE => { + if code[set] <= ch && ch <= code[set + 1] { + return ok; + } + set += 2; + } + OP_RANGE_UNI_IGNORE => { + if code[set] <= ch && ch <= code[set + 1] { + return ok; + } + let uch = upper_unicode(ch); + if code[set] <= uch && uch <= code[set + 1] { + return ok; + } + set += 2; + } + OP_NEGATE => ok = !ok, + OP_BIGCHARSET => { + // <256 block-indices as bytes + // packed into 64 words> + let count = code[set] as usize; + set += 1; + let block: i64 = if ch < 0x10000 { + // 256 indices stored as bytes, little/native order + // inside u32 words. 
+ let byte_index = (ch >> 8) as usize; + let word = code[set + byte_index / 4]; + i64::from((word >> ((byte_index % 4) * 8)) & 0xff) + } else { + -1 + }; + set += 64; + if block >= 0 { + let block = block as usize; + let bit = (block * 256 + (ch as usize & 255)) as u32; + if (code[set + (bit / 32) as usize] & (1u32 << (bit & 31))) != 0 { + return ok; + } + } + set += count * 8; + } + _ => return false, + } + } + } + + fn charset_loc_ignore(&self, set: usize, ch: u32) -> bool { + let lo = lower_locale(ch); + if self.charset(set, lo) { + return true; + } + let up = upper_locale(ch); + up != lo && self.charset(set, up) + } + + /// `SRE(count)` — count repeated single-character matches of the + /// item at `pat`, starting at `self.ptr`, up to `maxcount`. + fn count(&mut self, pat: usize, maxcount: u32) -> Result { + let code = self.code; + let ptr = self.ptr; + let mut end = self.end; + if maxcount != MAXREPEAT && (maxcount as usize) < end - ptr { + end = ptr + maxcount as usize; + } + let s = self.s; + let op = code[pat]; + let counted = match op { + OP_IN => { + let mut p = ptr; + while p < end && self.charset(pat + 2, s[p]) { + p += 1; + } + p - ptr + } + OP_ANY => { + let mut p = ptr; + while p < end && !is_linebreak(s[p]) { + p += 1; + } + p - ptr + } + OP_ANY_ALL => end - ptr, + OP_LITERAL => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && s[p] == chr { + p += 1; + } + p - ptr + } + OP_NOT_LITERAL => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && s[p] != chr { + p += 1; + } + p - ptr + } + OP_LITERAL_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && lower_ascii(s[p]) == chr { + p += 1; + } + p - ptr + } + OP_LITERAL_UNI_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && lower_unicode(s[p]) == chr { + p += 1; + } + p - ptr + } + OP_LITERAL_LOC_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && char_loc_ignore(chr, s[p]) { + p += 1; + } + p - ptr + 
} + OP_NOT_LITERAL_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && lower_ascii(s[p]) != chr { + p += 1; + } + p - ptr + } + OP_NOT_LITERAL_UNI_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && lower_unicode(s[p]) != chr { + p += 1; + } + p - ptr + } + OP_NOT_LITERAL_LOC_IGNORE => { + let chr = code[pat + 1]; + let mut p = ptr; + while p < end && !char_loc_ignore(chr, s[p]) { + p += 1; + } + p - ptr + } + _ => { + // General case: repeatedly match the subpattern. + self.ptr = ptr; + while self.ptr < end { + let matched = self.do_match(pat, false)?; + if !matched { + break; + } + } + let n = self.ptr - ptr; + self.ptr = ptr; + return Ok(n); + } + }; + self.ptr = ptr; + Ok(counted) + } + + /// `SRE(match)` — try to match the pattern at `pat` against the + /// string starting at `self.ptr`. Returns whether it matched; on + /// success `self.ptr` holds the end position. + fn do_match(&mut self, pat: usize, toplevel: bool) -> Result { + self.depth += 1; + if self.depth > MAX_DEPTH { + self.depth -= 1; + return Err(runtime_error( + "internal: regular expression recursion limit exceeded", + )); + } + let r = self.do_match_inner(pat, toplevel); + self.depth -= 1; + r + } + + fn do_match_inner(&mut self, mut pat: usize, toplevel: bool) -> Result { + let code = self.code; + let mut ptr = self.ptr; + let end = self.end; + + // Optimization info block at the head of the (sub)pattern. 
+ if code[pat] == OP_INFO { + let min = code[pat + 3] as usize; + if min != 0 && end - ptr < min { + return Ok(false); + } + pat += code[pat + 1] as usize + 1; + } + + loop { + let op = code[pat]; + pat += 1; + match op { + OP_MARK => { + let i = code[pat] as usize; + let ii = i as isize; + if i & 1 != 0 { + self.lastindex = (i / 2 + 1) as isize; + } + if ii > self.lastmark { + let mut j = self.lastmark + 1; + while j < ii { + self.marks[j as usize] = -1; + j += 1; + } + self.lastmark = ii; + } + self.marks[i] = ptr as isize; + pat += 1; + } + OP_LITERAL => { + if ptr >= end || self.s[ptr] != code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_NOT_LITERAL => { + if ptr >= end || self.s[ptr] == code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_SUCCESS => { + if toplevel + && ((self.match_all && ptr != self.end) + || (self.must_advance && ptr == self.start)) + { + return Ok(false); + } + self.ptr = ptr; + return Ok(true); + } + OP_AT => { + if !self.at(ptr, code[pat]) { + return Ok(false); + } + pat += 1; + } + OP_CATEGORY => { + if ptr >= end || !category(code[pat], self.s[ptr]) { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_ANY => { + if ptr >= end || is_linebreak(self.s[ptr]) { + return Ok(false); + } + ptr += 1; + } + OP_ANY_ALL => { + if ptr >= end { + return Ok(false); + } + ptr += 1; + } + OP_IN => { + if ptr >= end || !self.charset(pat + 1, self.s[ptr]) { + return Ok(false); + } + pat += code[pat] as usize; + ptr += 1; + } + OP_LITERAL_IGNORE => { + if ptr >= end || lower_ascii(self.s[ptr]) != code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_LITERAL_UNI_IGNORE => { + if ptr >= end || lower_unicode(self.s[ptr]) != code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_LITERAL_LOC_IGNORE => { + if ptr >= end || !char_loc_ignore(code[pat], self.s[ptr]) { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_NOT_LITERAL_IGNORE => { + if ptr >= end || lower_ascii(self.s[ptr]) == 
code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_NOT_LITERAL_UNI_IGNORE => { + if ptr >= end || lower_unicode(self.s[ptr]) == code[pat] { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_NOT_LITERAL_LOC_IGNORE => { + if ptr >= end || char_loc_ignore(code[pat], self.s[ptr]) { + return Ok(false); + } + pat += 1; + ptr += 1; + } + OP_IN_IGNORE => { + if ptr >= end || !self.charset(pat + 1, lower_ascii(self.s[ptr])) { + return Ok(false); + } + pat += code[pat] as usize; + ptr += 1; + } + OP_IN_UNI_IGNORE => { + if ptr >= end || !self.charset(pat + 1, lower_unicode(self.s[ptr])) { + return Ok(false); + } + pat += code[pat] as usize; + ptr += 1; + } + OP_IN_LOC_IGNORE => { + if ptr >= end || !self.charset_loc_ignore(pat + 1, self.s[ptr]) { + return Ok(false); + } + pat += code[pat] as usize; + ptr += 1; + } + OP_JUMP | OP_INFO => { + pat += code[pat] as usize; + } + OP_BRANCH => { + let save = self.snapshot(); + while code[pat] != 0 { + // Fast skip when the branch can't possibly match. + if code[pat + 1] == OP_LITERAL + && (ptr >= end || self.s[ptr] != code[pat + 2]) + { + pat += code[pat] as usize; + continue; + } + if code[pat + 1] == OP_IN + && (ptr >= end || !self.charset(pat + 3, self.s[ptr])) + { + pat += code[pat] as usize; + continue; + } + self.ptr = ptr; + // Each alternative flows through its trailing JUMP to + // the BRANCH tail and on to the final SUCCESS, so it + // inherits `toplevel` (CPython `DO_JUMP`) — otherwise + // the `must_advance`/`match_all` guards are skipped for + // top-level alternations (e.g. `fullmatch('a|ab','ab')`). + if self.do_match(pat + 1, toplevel)? 
{ + return Ok(true); + } + self.restore(&save); + pat += code[pat] as usize; + } + return Ok(false); + } + OP_REPEAT_ONE => { + let skip = code[pat] as usize; + let pmin = code[pat + 1] as usize; + let pmax = code[pat + 2]; + if pmin > end - ptr { + return Ok(false); + } + self.ptr = ptr; + let cnt0 = self.count(pat + 3, pmax)?; + let mut cnt = cnt0 as isize; + if (cnt as usize) < pmin { + return Ok(false); + } + let tail = pat + skip; + let after = ptr + cnt as usize; + if code[tail] == OP_SUCCESS + && after == self.end + && !(toplevel && self.must_advance && after == self.start) + { + self.ptr = after; + return Ok(true); + } + let save = self.snapshot(); + let orig = ptr; + let pmin_i = pmin as isize; + if code[tail] == OP_LITERAL { + let chr = code[tail + 1]; + loop { + while cnt >= pmin_i && { + let pos = orig + cnt as usize; + pos >= end || self.s[pos] != chr + } { + cnt -= 1; + } + if cnt < pmin_i { + break; + } + let pos = orig + cnt as usize; + self.ptr = pos; + // The tail is the continuation of *this* match, + // so it inherits `toplevel` (CPython `DO_JUMP`) + // — otherwise the trailing SUCCESS would skip the + // empty-match / `must_advance` guard and the + // scanner could loop on a zero-width match. + if self.do_match(tail, toplevel)? { + return Ok(true); + } + self.restore(&save); + cnt -= 1; + } + } else { + while cnt >= pmin_i { + let pos = orig + cnt as usize; + self.ptr = pos; + if self.do_match(tail, toplevel)? 
{ + return Ok(true); + } + self.restore(&save); + cnt -= 1; + } + } + return Ok(false); + } + OP_MIN_REPEAT_ONE => { + let skip = code[pat] as usize; + let pmin = code[pat + 1] as usize; + let pmax = code[pat + 2]; + if pmin > end - ptr { + return Ok(false); + } + self.ptr = ptr; + let mut cnt: isize = 0; + if pmin != 0 { + let r = self.count(pat + 3, code[pat + 1])?; + if r < pmin { + return Ok(false); + } + cnt = r as isize; + ptr += cnt as usize; + } + let tail = pat + skip; + if code[tail] == OP_SUCCESS + && !(toplevel + && ((self.match_all && ptr != self.end) + || (self.must_advance && ptr == self.start))) + { + self.ptr = ptr; + return Ok(true); + } + let save = self.snapshot(); + loop { + if !(pmax == MAXREPEAT || (cnt as u32) <= pmax) { + break; + } + self.ptr = ptr; + if self.do_match(tail, toplevel)? { + return Ok(true); + } + self.restore(&save); + self.ptr = ptr; + let r = self.count(pat + 3, 1)?; + if r == 0 { + break; + } + ptr += 1; + cnt += 1; + } + return Ok(false); + } + OP_POSSESSIVE_REPEAT_ONE => { + let skip = code[pat] as usize; + let pmin = code[pat + 1] as usize; + let pmax = code[pat + 2]; + if ptr + pmin > end { + return Ok(false); + } + self.ptr = ptr; + let cnt = self.count(pat + 3, pmax)?; + ptr += cnt; + if cnt < pmin { + return Ok(false); + } + pat += skip; + if code[pat] == OP_SUCCESS + && ptr == self.end + && !(toplevel && self.must_advance && ptr == self.start) + { + self.ptr = ptr; + return Ok(true); + } + // Evaluate the tail in this same frame. + } + OP_REPEAT => { + let skip = code[pat] as usize; + let rep = RepeatCtx { + count: -1, + pattern: pat, + last_ptr: -1, + prev: self.cur_repeat, + }; + let idx = self.repeats.len(); + self.repeats.push(rep); + self.cur_repeat = Some(idx); + self.ptr = ptr; + // The MAX_UNTIL/MIN_UNTIL operator (reached via `pat+skip`) + // ultimately continues to the pattern tail and SUCCESS, so + // it inherits `toplevel` (CPython `DO_JUMP`). 
Forcing it to + // `false` would skip the `must_advance` guard and let the + // scanner loop forever on a zero-width repeat such as + // `(a)*` over an empty match. + let r = self.do_match(pat + skip, toplevel); + self.cur_repeat = self.repeats[idx].prev; + self.repeats.truncate(idx); + return r; + } + OP_MAX_UNTIL => { + let idx = self + .cur_repeat + .ok_or_else(|| runtime_error("internal: MAX_UNTIL without REPEAT"))?; + self.ptr = ptr; + let count = self.repeats[idx].count + 1; + let rpat = self.repeats[idx].pattern; + let rmin = code[rpat + 1] as isize; + let rmax = code[rpat + 2]; + let item = rpat + 3; + if count < rmin { + self.repeats[idx].count = count; + self.ptr = ptr; + // Repeated-item matches inherit `toplevel` (CPython + // `DO_JUMP` for JUMP_MAX_UNTIL_1/_2): when the item can + // match empty (e.g. `(a?)*`), the recursion bottoms out + // at the tail SUCCESS, which must still see the + // `must_advance`/`match_all` guards. + if self.do_match(item, toplevel)? { + return Ok(true); + } + self.repeats[idx].count = count - 1; + self.ptr = ptr; + return Ok(false); + } + if (count < rmax as isize || rmax == MAXREPEAT) + && (ptr as isize) != self.repeats[idx].last_ptr + { + self.repeats[idx].count = count; + let save = self.snapshot(); + let saved_last = self.repeats[idx].last_ptr; + self.repeats[idx].last_ptr = ptr as isize; + self.ptr = ptr; + if self.do_match(item, toplevel)? { + return Ok(true); + } + self.repeats[idx].last_ptr = saved_last; + self.restore(&save); + self.repeats[idx].count = count - 1; + self.ptr = ptr; + } + let prev = self.repeats[idx].prev; + self.cur_repeat = prev; + self.ptr = ptr; + // Tail continuation inherits `toplevel` (CPython + // `DO_JUMP`) so the trailing SUCCESS still honours the + // `must_advance`/`match_all` guards. 
+ let r = self.do_match(pat, toplevel)?; + self.cur_repeat = Some(idx); + if r { + return Ok(true); + } + self.ptr = ptr; + return Ok(false); + } + OP_MIN_UNTIL => { + let idx = self + .cur_repeat + .ok_or_else(|| runtime_error("internal: MIN_UNTIL without REPEAT"))?; + self.ptr = ptr; + let count = self.repeats[idx].count + 1; + let rpat = self.repeats[idx].pattern; + let rmin = code[rpat + 1] as isize; + let rmax = code[rpat + 2]; + let item = rpat + 3; + if count < rmin { + self.repeats[idx].count = count; + self.ptr = ptr; + // Inherit `toplevel` (CPython `DO_JUMP` JUMP_MIN_UNTIL_1). + if self.do_match(item, toplevel)? { + return Ok(true); + } + self.repeats[idx].count = count - 1; + self.ptr = ptr; + return Ok(false); + } + let prev = self.repeats[idx].prev; + let save = self.snapshot(); + self.cur_repeat = prev; + self.ptr = ptr; + let r = self.do_match(pat, toplevel)?; + self.cur_repeat = Some(idx); + if r { + return Ok(true); + } + self.restore(&save); + self.ptr = ptr; + if (count >= rmax as isize && rmax != MAXREPEAT) + || (ptr as isize) == self.repeats[idx].last_ptr + { + return Ok(false); + } + self.repeats[idx].count = count; + let saved_last = self.repeats[idx].last_ptr; + self.repeats[idx].last_ptr = ptr as isize; + self.ptr = ptr; + // Inherit `toplevel` (CPython `DO_JUMP` JUMP_MIN_UNTIL_3). + if self.do_match(item, toplevel)? { + return Ok(true); + } + self.repeats[idx].last_ptr = saved_last; + self.repeats[idx].count = count - 1; + self.ptr = ptr; + return Ok(false); + } + OP_POSSESSIVE_REPEAT => { + let skip = code[pat] as usize; + let pmin = code[pat + 1] as usize; + let pmax = code[pat + 2]; + self.ptr = ptr; + let rep = RepeatCtx { + count: -1, + pattern: usize::MAX, + last_ptr: -1, + prev: self.cur_repeat, + }; + let idx = self.repeats.len(); + self.repeats.push(rep); + self.cur_repeat = Some(idx); + let body = pat + 3; + let mut cnt: usize = 0; + let mut failed = false; + while cnt < pmin { + if self.do_match(body, false)? 
{ + cnt += 1; + } else { + failed = true; + break; + } + } + if failed { + self.ptr = ptr; + self.cur_repeat = self.repeats[idx].prev; + self.repeats.truncate(idx); + return Ok(false); + } + let mut prev_ptr: Option = None; + loop { + let can_more = (pmax == MAXREPEAT || (cnt as u32) < pmax) + && Some(self.ptr) != prev_ptr; + if !can_more { + break; + } + let save = self.snapshot(); + prev_ptr = Some(self.ptr); + if self.do_match(body, false)? { + cnt += 1; + } else { + self.restore(&save); + self.ptr = prev_ptr.unwrap(); + break; + } + } + self.cur_repeat = self.repeats[idx].prev; + self.repeats.truncate(idx); + pat += skip + 1; + ptr = self.ptr; + continue; + } + OP_ATOMIC_GROUP => { + let skip = code[pat] as usize; + self.ptr = ptr; + if self.do_match(pat + 1, false)? { + pat += skip; + ptr = self.ptr; + } else { + self.ptr = ptr; + return Ok(false); + } + } + OP_GROUPREF => { + if !self.groupref_match(pat, GroupRefKind::Exact, end, ptr, &mut ptr) { + return Ok(false); + } + pat += 1; + } + OP_GROUPREF_IGNORE => { + if !self.groupref_match(pat, GroupRefKind::Ascii, end, ptr, &mut ptr) { + return Ok(false); + } + pat += 1; + } + OP_GROUPREF_UNI_IGNORE => { + if !self.groupref_match(pat, GroupRefKind::Unicode, end, ptr, &mut ptr) { + return Ok(false); + } + pat += 1; + } + OP_GROUPREF_LOC_IGNORE => { + if !self.groupref_match(pat, GroupRefKind::Locale, end, ptr, &mut ptr) { + return Ok(false); + } + pat += 1; + } + OP_GROUPREF_EXISTS => { + let g = code[pat] as usize; + let skip = code[pat + 1] as usize; + let groupref = (g * 2) as isize; + let set = if groupref >= self.lastmark { + false + } else { + let p = self.marks[groupref as usize]; + let e = self.marks[groupref as usize + 1]; + !(p < 0 || e < 0 || e < p) + }; + if set { + pat += 2; + } else { + pat += skip; + } + } + OP_ASSERT => { + let skip = code[pat] as usize; + let back = code[pat + 1] as usize; + if ptr - self.beginning < back { + return Ok(false); + } + self.ptr = ptr - back; + if !self.do_match(pat 
+ 2, false)? { + return Ok(false); + } + pat += skip; + } + OP_ASSERT_NOT => { + let skip = code[pat] as usize; + let back = code[pat + 1] as usize; + if ptr - self.beginning >= back { + self.ptr = ptr - back; + let save = self.snapshot(); + let matched = self.do_match(pat + 2, false)?; + self.restore(&save); + if matched { + return Ok(false); + } + } + pat += skip; + } + OP_FAILURE => return Ok(false), + _ => { + return Err(value_error(format!( + "internal: unsupported sre opcode {op}" + ))); + } + } + } + } + + fn groupref_match( + &self, + pat: usize, + kind: GroupRefKind, + end: usize, + start_ptr: usize, + ptr_out: &mut usize, + ) -> bool { + let g = self.code[pat] as usize; + let groupref = (g * 2) as isize; + if groupref >= self.lastmark { + return false; + } + let p0 = self.marks[groupref as usize]; + let e0 = self.marks[groupref as usize + 1]; + if p0 < 0 || e0 < 0 || e0 < p0 { + return false; + } + let mut p = p0 as usize; + let e = e0 as usize; + let mut ptr = start_ptr; + while p < e { + if ptr >= end { + return false; + } + let a = self.s[ptr]; + let b = self.s[p]; + let eq = match kind { + GroupRefKind::Exact => a == b, + GroupRefKind::Ascii => lower_ascii(a) == lower_ascii(b), + GroupRefKind::Unicode => lower_unicode(a) == lower_unicode(b), + GroupRefKind::Locale => lower_locale(a) == lower_locale(b), + }; + if !eq { + return false; + } + p += 1; + ptr += 1; + } + *ptr_out = ptr; + true + } + + /// `SRE(search)` — scan for the leftmost match at or after + /// `self.start`. Returns the start position of the match on success. + fn search(&mut self) -> Result, RuntimeError> { + // Determine where the real pattern starts (after any INFO block) + // for the anchored-pattern fast reject. 
+ let mut p = 0usize; + let mut min = 0usize; + if self.code[0] == OP_INFO { + min = self.code[3] as usize; + p = 1 + self.code[1] as usize; + } + let anchored = self.code.get(p) == Some(&OP_AT) + && matches!( + self.code.get(p + 1).copied(), + Some(AT_BEGINNING) | Some(AT_BEGINNING_STRING) + ); + + let mut ptr = self.start; + let mut first = true; + loop { + if min != 0 && self.end.saturating_sub(ptr) < min { + return Ok(None); + } + self.start = ptr; + self.ptr = ptr; + self.reset_capture(); + let matched = self.do_match(0, true)?; + if first { + self.must_advance = false; + first = false; + } + if matched { + return Ok(Some(ptr)); + } + if anchored { + return Ok(None); + } + if ptr >= self.end { + return Ok(None); + } + ptr += 1; + } + } +} + +#[derive(Clone, Copy)] +enum GroupRefKind { + Exact, + Ascii, + Unicode, + Locale, +} + +// --------------------------------------------------------------------------- +// Module functions +// --------------------------------------------------------------------------- + +fn arg_i64(args: &[Object], i: usize, name: &str) -> Result { + args.get(i) + .and_then(|o| o.as_i64()) + .ok_or_else(|| type_error(format!("_sre: expected int for {name}"))) +} + +/// Read a Python sequence of small ints into a `Vec`. +fn codeseq_to_vec(obj: &Object) -> Result, RuntimeError> { + let collect = |items: &[Object]| -> Result, RuntimeError> { + let mut out = Vec::with_capacity(items.len()); + for it in items { + let v = it + .as_i64() + .ok_or_else(|| type_error("_sre.compile: code must be a sequence of ints"))?; + if !(0..=i64::from(u32::MAX)).contains(&v) { + return Err(value_error("_sre.compile: code value out of range")); + } + out.push(v as u32); + } + Ok(out) + }; + match obj { + Object::List(l) => collect(&l.borrow()), + Object::Tuple(t) => collect(t), + _ => Err(type_error("_sre.compile: code must be a list or tuple")), + } +} + +/// Decode the subject into code points (str) or byte values (bytes). 
+fn subject_to_vec(obj: &Object) -> Result, RuntimeError> { + match obj { + Object::Str(s) => Ok(s.chars().map(|c| c as u32).collect()), + Object::Bytes(b) => Ok(b.iter().map(|&x| u32::from(x)).collect()), + Object::ByteArray(b) => Ok(b.borrow().iter().map(|&x| u32::from(x)).collect()), + _ => Err(type_error("expected string or bytes-like object")), + } +} + +/// `_sre.compile(code, groups)` → an integer handle into the registry. +fn sre_compile(args: &[Object]) -> Result { + let code = codeseq_to_vec( + args.first() + .ok_or_else(|| type_error("_sre.compile: code"))?, + )?; + let groups = arg_i64(args, 1, "groups")?.max(0) as usize; + let handle = REGISTRY.with(|reg| { + let mut reg = reg.borrow_mut(); + reg.push(Rc::new(CompiledCode { code, groups })); + reg.len() - 1 + }); + Ok(Object::Int(handle as i64)) +} + +/// `_sre.exec(handle, string, pos, endpos, mode, must_advance)`. +/// +/// Returns `None` on no match, otherwise a tuple +/// `(start, end, lastindex, marks)` where `marks` is a tuple of +/// `2 * groups` code-point indices (`-1` for an unset group). +fn sre_exec(args: &[Object]) -> Result { + let handle = arg_i64(args, 0, "handle")? 
as usize; + let cc = REGISTRY.with(|reg| reg.borrow().get(handle).cloned()); + let cc = cc.ok_or_else(|| value_error("_sre.exec: invalid pattern handle"))?; + let subject = subject_to_vec(args.get(1).ok_or_else(|| type_error("_sre.exec: string"))?)?; + let slen = subject.len() as i64; + let pos = arg_i64(args, 2, "pos")?.clamp(0, slen) as usize; + let endpos = arg_i64(args, 3, "endpos")?.clamp(0, slen) as usize; + let mode = arg_i64(args, 4, "mode")?; + let must_advance = args + .get(5) + .map(|o| o.as_i64().unwrap_or(0) != 0) + .unwrap_or(false); + + if pos > endpos { + return Ok(Object::None); + } + + let mut m = Matcher::new(&subject, &cc.code, cc.groups); + m.end = endpos; + m.start = pos; + m.ptr = pos; + m.must_advance = must_advance; + + let (mstart, ok) = match mode { + // 1 = match (anchored at pos) + 1 => { + let r = m.do_match(0, true)?; + (pos, r) + } + // 2 = fullmatch (anchored + must reach endpos) + 2 => { + m.match_all = true; + let r = m.do_match(0, true)?; + (pos, r) + } + // 0 = search + _ => match m.search()? { + Some(s) => (s, true), + None => (0, false), + }, + }; + + if !ok { + return Ok(Object::None); + } + + let mend = m.ptr; + let mut marks_out: Vec = Vec::with_capacity(cc.groups * 2); + for i in 0..cc.groups * 2 { + let v = if (i as isize) <= m.lastmark { + m.marks[i] + } else { + -1 + }; + marks_out.push(Object::Int(v as i64)); + } + Ok(Object::new_tuple(vec![ + Object::Int(mstart as i64), + Object::Int(mend as i64), + Object::Int(m.lastindex as i64), + Object::new_tuple(marks_out), + ])) +} + +fn sre_ascii_tolower(args: &[Object]) -> Result { + Ok(Object::Int( + i64::from(lower_ascii(arg_i64(args, 0, "ch")? as u32)) + )) +} +fn sre_ascii_iscased(args: &[Object]) -> Result { + Ok(Object::Bool(ascii_iscased(arg_i64(args, 0, "ch")? as u32))) +} +fn sre_unicode_tolower(args: &[Object]) -> Result { + Ok(Object::Int( + i64::from(lower_unicode(arg_i64(args, 0, "ch")? 
as u32)) + )) +} +fn sre_unicode_iscased(args: &[Object]) -> Result { + Ok(Object::Bool( + unicode_iscased(arg_i64(args, 0, "ch")? as u32), + )) +} +fn sre_getcodesize(_args: &[Object]) -> Result { + Ok(Object::Int(CODESIZE)) +} +/// `_sre.getlower(ch, flags)`. +fn sre_getlower(args: &[Object]) -> Result { + let ch = arg_i64(args, 0, "ch")? as u32; + let flags = arg_i64(args, 1, "flags").unwrap_or(0); + // SRE_FLAG_LOCALE = 4, SRE_FLAG_UNICODE = 32 + let lowered = if flags & 4 != 0 { + lower_locale(ch) + } else if flags & 32 != 0 { + lower_unicode(ch) + } else { + lower_ascii(ch) + }; + Ok(Object::Int(i64::from(lowered))) +} + +fn b(name: &'static str, body: fn(&[Object]) -> Result) -> Object { + Object::Builtin(Rc::new(BuiltinFn { + name, + call: Box::new(body), + call_kw: None, + })) +} + +pub fn build(_cache: &ModuleCache) -> Rc { + let dict = Rc::new(RefCell::new(DictData::new())); + { + let mut d = dict.borrow_mut(); + d.insert( + DictKey(Object::from_static("__name__")), + Object::from_static("_sre"), + ); + d.insert( + DictKey(Object::from_static("__doc__")), + Object::from_static("WeavePy native SRE regular-expression core (RFC 0035)."), + ); + d.insert(DictKey(Object::from_static("MAGIC")), Object::Int(MAGIC)); + d.insert( + DictKey(Object::from_static("CODESIZE")), + Object::Int(CODESIZE), + ); + d.insert( + DictKey(Object::from_static("MAXREPEAT")), + Object::Int(MAXREPEAT_I64), + ); + d.insert( + DictKey(Object::from_static("MAXGROUPS")), + Object::Int(MAXGROUPS), + ); + d.insert( + DictKey(Object::from_static("compile")), + b("compile", sre_compile), + ); + d.insert(DictKey(Object::from_static("exec")), b("exec", sre_exec)); + d.insert( + DictKey(Object::from_static("ascii_tolower")), + b("ascii_tolower", sre_ascii_tolower), + ); + d.insert( + DictKey(Object::from_static("ascii_iscased")), + b("ascii_iscased", sre_ascii_iscased), + ); + d.insert( + DictKey(Object::from_static("unicode_tolower")), + b("unicode_tolower", sre_unicode_tolower), + ); + 
d.insert( + DictKey(Object::from_static("unicode_iscased")), + b("unicode_iscased", sre_unicode_iscased), + ); + d.insert( + DictKey(Object::from_static("getcodesize")), + b("getcodesize", sre_getcodesize), + ); + d.insert( + DictKey(Object::from_static("getlower")), + b("getlower", sre_getlower), + ); + } + Rc::new(PyModule { + name: "_sre".to_owned(), + filename: None, + dict, + }) +} diff --git a/crates/weavepy-vm/src/stdlib/thread_real.rs b/crates/weavepy-vm/src/stdlib/thread_real.rs index a54dfd4..16cea09 100644 --- a/crates/weavepy-vm/src/stdlib/thread_real.rs +++ b/crates/weavepy-vm/src/stdlib/thread_real.rs @@ -277,6 +277,7 @@ fn make_lock_object(lock: Arc) -> Object { let inst = Rc::new(PyInstance { class: lock_type(), dict, + native: None, }); Object::Instance(inst) } @@ -376,6 +377,7 @@ fn make_rlock_object(rlock: Arc) -> Object { let inst = Rc::new(PyInstance { class: rlock_type(), dict, + native: None, }); Object::Instance(inst) } diff --git a/crates/weavepy-vm/src/stdlib/weakref_real.rs b/crates/weavepy-vm/src/stdlib/weakref_real.rs index 7f8b485..8d4511d 100644 --- a/crates/weavepy-vm/src/stdlib/weakref_real.rs +++ b/crates/weavepy-vm/src/stdlib/weakref_real.rs @@ -304,7 +304,11 @@ fn make_ref_object(target: Object, callback: Option, kind_tag: u8) -> Ob ); } - Object::Instance(Rc::new(PyInstance { class, dict })) + Object::Instance(Rc::new(PyInstance { + class, + dict, + native: None, + })) } /// `_weakref.getweakrefcount(obj)` — number of live weakrefs diff --git a/crates/weavepy-vm/src/types.rs b/crates/weavepy-vm/src/types.rs index 29800b6..75fbc41 100644 --- a/crates/weavepy-vm/src/types.rs +++ b/crates/weavepy-vm/src/types.rs @@ -221,6 +221,15 @@ fn compute_c3( pub struct PyInstance { pub class: Rc, pub dict: Rc>, + /// For instances of a subclass of an immutable built-in + /// (`int`, `str`, `float`, `bytes`, `tuple`, …) this holds the + /// underlying primitive value the instance *is* — the moral + /// equivalent of CPython storing the C-level 
value in the object + /// struct. `None` for ordinary objects. Set once at construction + /// (the wrapped builtins are themselves immutable) and unwrapped + /// by the numeric / comparison / hashing / conversion fast paths + /// so e.g. `class C(int)` instances behave like real ints. + pub native: Option, } impl PyInstance { @@ -228,6 +237,17 @@ impl PyInstance { Self { class, dict: Rc::new(RefCell::new(DictData::new())), + native: None, + } + } + + /// Build an instance that wraps a primitive `native` value + /// (subclass of `int`/`str`/…). + pub fn with_native(class: Rc, native: Object) -> Self { + Self { + class, + dict: Rc::new(RefCell::new(DictData::new())), + native: Some(native), } } } diff --git a/docs/rfcs/0035-faithful-re-sre-unicode.md b/docs/rfcs/0035-faithful-re-sre-unicode.md new file mode 100644 index 0000000..0e38bd3 --- /dev/null +++ b/docs/rfcs/0035-faithful-re-sre-unicode.md @@ -0,0 +1,435 @@ +# RFC 0035: A faithful `re`/`_sre` engine — porting CPython's secret-labs matcher, with the Unicode and `%`-formatting fidelity it demands + +- **Status**: Accepted +- **Authors**: WeavePy authors +- **Created**: 2026-05-31 +- **Tracking issue**: TBD +- **Builds on**: RFC 0012 (modules/imports + frozen stdlib), RFC 0015 + (object-model completion), RFC 0020 (real-Python frozen stdlib), + RFC 0023 (drop-in parity), RFC 0030 (pure-Python drop-in), RFC 0034 + (the CPython test suite as a live harness — `test_re.py` is exactly + the kind of file it was built to run) + +## Summary + +WeavePy's `re` module was, until this RFC, a **translation layer**: a +native Rust shim (`stdlib/re.rs`) that parsed a subset of Python's +regex syntax and forwarded it to the Rust [`regex`] and [`fancy_regex`] +crates. That bought us working `re.match`/`search`/`findall` for the +common cases quickly, but it was a parallel implementation of a +notoriously corner-heavy language. 
Anywhere CPython's behaviour is +defined by *the secret-labs engine itself* — backtracking order, the +exact group-reset semantics on alternation, zero-width-match +bookkeeping, `\b` at the bytes/str boundary, the precise text of a +`re.error`, `Pattern`/`Match` repr and attribute surface, the +`re.Scanner` undocumented-but-tested API — a re-implementation can only +approximate, and the approximations are exactly what `Lib/test/test_re.py` +exists to catch. + +This RFC replaces the shim with **CPython's own engine**: + +1. A **native `_sre` core** (`crates/weavepy-vm/src/stdlib/sre_mod.rs`) + — a faithful port of `Modules/_sre/sre_lib.h`'s backtracking VM + (`SRE(match)`, `SRE(search)`, `SRE(count)`, `SRE(charset)`), the + case-folding/character-classification primitives, and the module + surface real code touches (`compile`, the compiled-pattern `exec` + trampoline, `ascii_tolower`/`unicode_tolower`, + `ascii_iscased`/`unicode_iscased`, `getlower`, `getcodesize`, plus + `MAGIC`/`CODESIZE`). Compiled programs live in a thread-local + registry keyed by an integer handle, so the Rust core stays free of + Python-object lifetime concerns. +2. The **frozen Python `re` package**, ported from CPython 3.13: + `re/__init__.py`, `re._constants`, `re._casefix`, `re._parser`, + `re._compiler`, all essentially verbatim, plus a small + `re._engine` that builds the `Pattern`/`Match` classes on top of + the native core (CPython builds those in C; we build them in frozen + Python over a minimal native primitive — see *Detailed design §3*). + The pre-3.11 deprecated aliases `sre_constants`/`sre_parse`/ + `sre_compile` are shipped as re-export shims. +3. The **Unicode, `str`/`bytes`, and `%`-formatting fidelity** that the + real `re` parser/compiler turned out to depend on — and which the + shim had hidden. 
Porting CPython's own `_parser.py` surfaced a tail + of interpreter gaps (`int` subclassing, slice deletion/assignment, + the legacy `__getitem__` iteration protocol, faithful `repr()` + printability, `str(bytes, encoding)`, `\U` escapes, `%`-format + dunder dispatch). Each is a general correctness fix that happened to + be *forced into the light* by running CPython's code unmodified. + +Diff shape: **~6K lines added** — the `_sre` Rust core (~1.6K), the five +frozen `re` submodules + three alias shims (~3K, mostly *CPython's own +Python* carried verbatim), the interpreter/object-model fixes (~1K +across the VM/compiler/parser), `tests/regrtest/test_re.py`, the +module-registry rewiring, and this RFC — against ~1.1K deleted with the +old `stdlib/re.rs` shim (and its VM-level `re.sub`-callable hook, +`do_re_sub_callable`), for a net diff of **~5K LOC**. + +That the *fidelity* upgrade is also a *smaller* footprint is the whole +argument: a faithful port reuses CPython's ~3K-line Python parser/ +compiler unchanged and reimplements only the ~1.6K-line C matcher, +where a from-scratch shim would have to grow toward the full corner-case +surface line by line and still never reach parity. Compactness here is +evidence the strategy is right, not that the scope is small. + +Mission alignment: `re` is one of the most-imported modules in the +stdlib, and `test_re.py` is one of CPython's largest single-module test +files. Running *CPython's engine* rather than an emulation of it is the +difference between "regex mostly works" and "regex is CPython." + +## Motivation + +The shim was a liability for three compounding reasons: + +- **It was a second implementation of a hard language.** Python's regex + dialect is not PCRE and is not the Rust `regex` crate's dialect. 
+ Conditional groups `(?(id)yes|no)`, the exact semantics of + `\b`/`\B`/`\A`/`\Z`, possessive quantifiers and atomic groups + `(?>...)`, the group-state rollback on a failed alternation branch, + the rule that an empty match adjacent to a previous match is skipped + in `findall`/`finditer`/`sub` (`must_advance`), and the textual + content of every `re.error` ("nothing to repeat", "missing ), + unterminated subpattern", "redefinition of group name") are all + defined operationally by the secret-labs engine. `fancy_regex` makes + *different* choices, so any program that depends on Python's choices + silently diverged. +- **It could never run `test_re.py`.** RFC 0034 made CPython's own + `Lib/test/test_re.py` runnable in principle; the shim made it + unpassable in practice, because the test asserts on engine internals + (`sre_compile` output sizes, `Pattern.__repr__`, `re.error` line/col, + `Match.regs`, the `_sre.MAGIC`/`CODESIZE` contract that + `re._compiler` checks at import). +- **It diverged from the frozen-stdlib strategy.** RFC 0020's thesis is + "ship real CPython Python where we can." `re` is *the* poster child: + CPython's `re` is ~90% Python (`_parser`/`_compiler`/`__init__`) over + a ~10% C core (`_sre`). Porting the Python verbatim and reimplementing + only the C core is both less code and dramatically more faithful than + a from-scratch shim. + +The cost of inaction was an open-ended tail of "regex behaves subtly +differently" bugs — the worst kind, because they are silent. + +## CPython reference + +We track **CPython 3.13**. Specific sources ported or matched: + +- **The C core**: `Modules/_sre/sre.c`, `Modules/_sre/sre_lib.h`, + `Modules/_sre/sre_constants.h`, `Modules/_sre/sre.h`. The matcher + port mirrors the `SRE(match)` opcode dispatch loop, `SRE(search)`'s + prefix/charset fast-paths, `SRE(count)` for `REPEAT_ONE`/ + `MIN_REPEAT_ONE`, and `SRE(charset)`/`SRE(charset_loc_ignore)`. 
The + `_sre.MAGIC` constant (`20230612`) and `CODESIZE` (`4`) are the + contract `re._compiler` checks at import. +- **The Python layer**: `Lib/re/__init__.py`, `Lib/re/_constants.py`, + `Lib/re/_casefix.py`, `Lib/re/_parser.py`, `Lib/re/_compiler.py`. +- **The deprecated aliases**: `Lib/sre_constants.py`, `Lib/sre_parse.py`, + `Lib/sre_compile.py` (each a thin "moved to `re._*`" shim since 3.11). +- **The Unicode/format surface** the engine leans on: the `str`/`bytes` + data model (the language reference §3), `str.isprintable`/`repr` + (CPython `Objects/unicodeobject.c`'s `unicode_repr` and + `Py_UNICODE_ISPRINTABLE`), printf-style `%` formatting + (`PyUnicode_Format`), and the `str(object, encoding, errors)` + constructor form. +- **The acceptance test**: `Lib/test/test_re.py` (the parts that don't + require `_sre` C-detail refleak hooks). + +As with RFC 0034, anything the engine reaches for that we deliberately +do not model raises the *same* exception CPython would, so an +unsupported corner reads as the correct error, never a wrong answer. + +## Detailed design + +### 1 — the native `_sre` core (`stdlib/sre_mod.rs`) + +The native module exposes the minimal surface `re._compiler` and +`re._engine` import: + +| symbol | role | +|---|---| +| `MAGIC` = `20230612` | version stamp `re._compiler` asserts against `_constants.MAGIC` | +| `CODESIZE` = `4` | word size of the compiled program (`sizeof(SRE_CODE)`) | +| `compile(code, groups)` | intern a compiled program into the thread-local registry; returns an integer handle | +| `exec(handle, string, pos, endpos, …)` | run `SRE(search)`/`SRE(match)` and return the match groups (or `None`) | +| `ascii_tolower` / `unicode_tolower` | case-fold a single code point | +| `ascii_iscased` / `unicode_iscased` | "is this code point case-sensitive?"
(drives `IGNORECASE`) | +| `getlower(ch, flags)` | the flag-aware lowercase used by `LITERAL_IGNORE` | +| `getcodesize()` | `CODESIZE`, as a function (the historical API) | + +**The matcher.** `SRE(match)` is a recursive backtracking interpreter +over the `u32` program emitted by `re._compiler`. The port keeps the +opcode numbering from `_constants.OPCODES` (so the *Python* compiler and +the *Rust* matcher agree by construction) and implements the full set +real patterns reach: `LITERAL`/`NOT_LITERAL`/`LITERAL_IGNORE`/ +`LITERAL_UNI_IGNORE`/`LITERAL_LOC_IGNORE`, `ANY`/`ANY_ALL`, `IN`/ +`IN_IGNORE`/`IN_UNI_IGNORE`/`IN_LOC_IGNORE` (delegating to +`SRE(charset)`), `BRANCH`, `REPEAT`/`MAX_UNTIL`/`MIN_UNTIL`, +`REPEAT_ONE`/`MIN_REPEAT_ONE` (with the `SRE(count)` fast path), +`GROUPREF`/`GROUPREF_IGNORE`/`GROUPREF_EXISTS`, `AT` (the +anchors/boundaries), `ASSERT`/`ASSERT_NOT` (look-around), +`MARK`, `JUMP`, `SUCCESS`, `FAILURE`, `INFO`, `ATOMIC_GROUP`/ +`POSSESSIVE_REPEAT`/`POSSESSIVE_REPEAT_ONE`. + +**Zero-width correctness — the `toplevel`/`must_advance` invariant.** +The single subtlest part of the port. CPython threads a `toplevel` flag +through `SRE(match)` and uses it (with the saved repeat mark) to refuse +a second *empty* iteration of a repeat or branch tail, which is what +keeps `re.findall(r'a*', 'aaa')`, `re.split(r'x*', 'axbxc')`, and +`re.sub(r'', '-', 'abc')` from looping forever or producing the wrong +split. The port reproduces this exactly: `OP_BRANCH`, `OP_REPEAT`, +`OP_MAX_UNTIL`, and `OP_MIN_UNTIL` **inherit** the caller's `toplevel` +into their tail continuations rather than forcing `false` (an early +draft hard-coded `false` and hung on `a{,3}` against `'aaaaa'`). The +`REPEAT_ONE` count loop uses signed arithmetic so a decrement past zero +can't wrap a `usize`. + +**Lifetime model.** The Rust matcher never holds a Python object across +a call. 
`compile` copies the `code: Vec<u32>` and the group count +into a `CompiledCode` entry in the thread-local registry and returns its index as a plain +`int`; `exec` looks the entry up, runs against the subject string/bytes, +and returns owned results. This sidesteps the GC/borrow questions a +native `Pattern` object would raise and keeps the core a pure function +of `(program, subject, pos)`. + +### 2 — the frozen Python `re` package + +Registered as a frozen package with submodules (the model +`email`/`importlib` already use): + +``` +re (package) <- Lib/re/__init__.py (re_init.py) +re._constants (module) <- Lib/re/_constants.py (verbatim) +re._casefix (module) <- Lib/re/_casefix.py (verbatim) +re._parser (module) <- Lib/re/_parser.py (verbatim) +re._compiler (module) <- Lib/re/_compiler.py (≈verbatim) +re._engine (module) <- WeavePy-specific glue (Pattern/Match) +sre_constants (module) <- re-export shim (deprecated alias) +sre_parse (module) <- re-export shim (deprecated alias) +sre_compile (module) <- re-export shim (deprecated alias) +``` + +`_constants`, `_casefix`, and `_parser` are CPython 3.13 **verbatim** — +the whole point is to run CPython's parser, not ours. `_compiler` is +all-but-verbatim: the only adaptations are where it calls into the C +core (it gets `MAGIC`/`CODESIZE` from our native `_sre`, and its +`_bytes_to_codes` helper assembles the `array`-backed program the same +way, which is what surfaced the `int.byteorder`/`array` plumbing fixes +below). + +### 3 — `Pattern`/`Match` in frozen Python (`re._engine`) + +CPython implements `Pattern` and `Match` as **C types** in `_sre`. +Reproducing those as native Rust objects would mean a second object +type with its own GC integration, attribute table, and repr — a large +
Instead `re._engine` defines `Pattern` and +`Match` as **Python classes** over the native primitive: + +- `Pattern` holds the compiled handle, `pattern`/`flags`/`groups`/ + `groupindex`/`groupdict`, and implements `match`/`fullmatch`/`search`/ + `findall`/`finditer`/`split`/`sub`/`subn`/`scanner` by calling + `_sre.exec` and wrapping results. The scan loop (`_iter`) implements + the `must_advance` "skip empty match adjacent to previous" rule in + Python, mirroring `_sre`'s `scanner`/`Pattern.finditer`. +- `Match` exposes `group`/`groups`/`groupdict`/`start`/`end`/`span`/ + `expand`/`__getitem__`/`regs`/`pos`/`endpos`/`lastindex`/ + `lastgroup`/`re`/`string`, with CPython's `repr`. +- **Template expansion** (`re.sub`'s replacement parsing, `\g` + and `\1` back-references, and the *callable* `repl` path) lives here + in Python — which is why the old VM-level `do_re_sub_callable` + interception in `weavepy-vm/src/lib.rs` is **deleted**: a callable + `repl` is now just a Python call in `_engine`, exactly as in CPython. + +This "C core + Python wrapper class" split is the same shape CPython +*almost* has (its wrapper is C only for speed); functionally it is +indistinguishable to user code. + +### 4 — interpreter & object-model fixes surfaced by the port + +Running CPython's unmodified `_parser.py`/`_compiler.py` is a stress +test of the interpreter. Each gap below was a real CPython-behaviour +bug that the shim had simply never exercised; all are fixed as general +correctness work in `weavepy-vm`/`weavepy-compiler`/`weavepy-parser`: + +- **`int` subclassing.** `_parser` and `_constants` use + `_NamedIntConstant(int)` for opcodes, and `enum.IntFlag`/`IntEnum` + back the `re` flags. 
`PyInstance` gained a `native: Option<Object>` + slot; `object.__new__` initialises it for `int`/`float` subclasses; + and every arithmetic/identity/hash/ordering/truth path + (`as_i64`, `as_usize`, `eq_value`, `DictKey::hash`, `is_truthy`, + `Object::cmp`, `binary_op`, `binary_subscr`, the `int()`/`float()`/ + `bool()` constructors) now unwraps a subclass to its native value. + `enum.py`'s `IntEnum`/`IntFlag` were re-based on `(int, Enum)` / + `(int, Flag)` and member creation routed through `int.__new__`. +- **Slice deletion & assignment.** `_parser` does `del subpattern[x]` + and `subpattern[a:b] = …`. Added `del seq[slice]` for `list`/ + `bytearray`, slice-assignment from an arbitrary iterable RHS + (via the VM's `collect_iterable`), `range` slicing, and correct + negative-step (`[::-1]`) handling mirroring `PySlice_AdjustIndices`. +- **Legacy iteration protocol.** Objects with `__getitem__`+`__len__` + but no `__iter__` are now iterable (call `__getitem__(0,1,2,…)` until + `IndexError`), which `re`'s `SubPattern` relies on. +- **The ABA call-cache bug.** `MAKE_FUNCTION`'s inline cache could + mis-specialize when a freed function's `Rc` address was reused + (classic ABA). `CallPyExact`/`CallPyExactNoFree` now re-validate the + callee's closure shape and arg count before taking the fast path. +- **`bytes`/`bytearray` methods.** `translate`/`maketrans` implemented; + `find`/`rfind`/`index`/`count` now honour `start`/`end` (bytes + patterns go through these). +- **Truthiness dispatch.** A shared `obj_truthy` that dispatches + `__bool__` then `__len__` for instances, wired into `PopJumpIfFalse`/ + `PopJumpIfTrue`/`UnaryOp(not)`/`bool()` — without it `(?i)`-style + inline-flag parsing mis-fired. +- **Compiler import binding.** `collect_decls` now records names bound + by `import`/`from … import` so they can be captured as cellvars + (a closure in `_compiler` referenced an imported name).
+ +### 5 — Unicode, `str`/`bytes`, and `%`-formatting fidelity + +- **Faithful `repr()`.** `str.__repr__` now picks CPython's quote + (double quotes iff the string has a `'` and no `"`, else single) and + escapes non-printable code points as `\xNN`/`\uNNNN`/`\UNNNNNNNN`. + Printability is decided by Unicode general category via the + `unicode_properties` crate (`Cc`/`Cf`/`Cs`/`Co`/`Cn` and the + separators are non-printable; `U+0020` is the one printable space), + matching `Py_UNICODE_ISPRINTABLE`. `str.isprintable` shares the + helper. (`re.escape` and `Pattern.__repr__` both depend on this.) +- **`str(bytes, encoding[, errors])`.** The two/three-arg `str` + constructor now decodes via the codec machinery instead of returning + `repr(b'…')`. `re._parser.Tokenizer` builds itself from + `str(byte, 'latin-1')`, so without this every *bytes* pattern was + mis-tokenised. +- **`\U` escapes.** The lexer's string-literal decoder learned the + eight-hex-digit `\U` form (it already handled `\x`/`\u`), so non-BMP + literals like `'\U0001f600'` parse correctly. +- **`%`-format dunder dispatch.** `str.__mod__` (`"%s"/"%r" % obj`) now + dispatches `__str__`/`__repr__` for instances (so `"%s" % some_error` + prints the message, not ``), and `%d`/`%i`/`%u` + unwrap `int` subclasses (so `"%d" % OPCODES.LITERAL` formats the + value). Implemented by threading a VM-aware `resolve` callback into a + refactored `percent_format_with`. + +### 6 — module rewiring & test + +- `stdlib/mod.rs` registers `_sre` as a builtin native module and the + nine frozen sources above; the old native `re` registration and + `stdlib/re.rs` are removed. +- `weavepy-vm/src/lib.rs` drops `do_re_sub_callable` (now handled in + frozen `re._engine`). 
+- `tests/regrtest/test_re.py` is a new bundled fixture (auto-discovered + by the RFC 0034 harness) covering literals/quantifiers/groups/ + alternation/flags, look-around, back-references, named groups, + `split`/`sub`/`subn`/`findall`/`finditer`, bytes patterns, Unicode + categories, `re.error` text, and the zero-width edge cases that the + `toplevel` invariant protects. Every expectation was cross-checked + against the local CPython 3.13 oracle. + +## Implementation status (post-merge) + +| area | status | notes | +|------|--------|-------| +| native `_sre` matcher (`SRE(match)`/`search`/`count`/`charset`) | ✅ | full opcode set incl. look-around, back-refs, possessive/atomic, conditional groups | +| `_sre` module surface (`compile`/`exec`/case-fold/`MAGIC`/`CODESIZE`) | ✅ | thread-local compiled-program registry keyed by int handle | +| zero-width `toplevel`/`must_advance` invariant | ✅ | `BRANCH`/`REPEAT`/`MAX_UNTIL`/`MIN_UNTIL` inherit `toplevel`; signed `REPEAT_ONE` count | +| frozen `re` package (`__init__`/`_constants`/`_casefix`/`_parser`) | ✅ | CPython 3.13 verbatim | +| frozen `re._compiler` | ✅ | ≈verbatim; targets our native `_sre` `MAGIC`/`CODESIZE` | +| `re._engine` `Pattern`/`Match` + template expansion | ✅ | Python classes over the native core; callable `repl` is plain Python | +| deprecated `sre_constants`/`sre_parse`/`sre_compile` aliases | ✅ | re-export shims | +| removed: native `stdlib/re.rs` + `do_re_sub_callable` VM hook | ✅ | the shim and its VM interception are gone | +| `int`/`float` subclassing (`native` slot; arith/hash/cmp/truth) | ✅ | `enum.IntFlag`/`IntEnum`, `_NamedIntConstant` work | +| slice delete/assign, `range` slicing, negative step | ✅ | `del seq[slice]`, `seq[a:b]=iter`, `r[::-1]` | +| legacy `__getitem__` iteration protocol | ✅ | `__getitem__`+`__len__` without `__iter__` iterates | +| ABA inline-cache hardening (`MAKE_FUNCTION`/`CallPyExact*`) | ✅ | closure-shape + arg-count re-validation | +| `bytes`/`bytearray` 
`translate`/`maketrans`; `find`-family `start`/`end` | ✅ | bytes patterns rely on these | +| truthiness dispatch (`__bool__`/`__len__`) | ✅ | wired into jumps/`not`/`bool()` | +| faithful `repr()`/`isprintable` (Unicode general category) | ✅ | quote selection + `\xNN`/`\uNNNN`/`\UNNNNNNNN` escaping | +| `str(bytes, encoding[, errors])`; lexer `\U` escapes | ✅ | bytes patterns + non-BMP literals | +| `%`-format `__str__`/`__repr__` dispatch + int-subclass unwrap | ✅ | `"%s" % exc`, `"%d" % OPCODE` | +| bundled `tests/regrtest/test_re.py` | ✅ | passes under WeavePy and CPython 3.13 | + +## Drawbacks + +- **The matcher is recursive, like CPython's.** `SRE(match)` recurses + per opcode group; pathological patterns can hit the native stack + before Python's `sys.setrecursionlimit`. CPython has the same shape + (and the same class of failure); a future iterative/explicit-stack + rewrite is possible but out of scope. +- **Two languages for one module.** `re` is now Rust (`_sre`) + Python + (`re._*`). That is precisely CPython's split, but it means a bug can + live on either side of the FFI line; the saving grace is that the + Python side is *CPython's own code*, so bugs concentrate in the small + Rust core. +- **The surfaced fix tail was broad.** Half this RFC is interpreter + fixes (slicing, int-subclassing, repr, `%`) that are *not* about + regex. That is the nature of running real CPython code: it exercises + the whole object model. Those fixes are pure upside elsewhere, but + they widened the diff. +- **No native `Pattern`/`Match` type.** Code that introspects + `type(p).__module__ == '_sre'` or pickles a compiled pattern via the + C type's `__reduce__` sees our Python classes instead. The observable + attribute/method surface matches; the type identity does not. + +## Alternatives + +1. 
**Keep the `regex`/`fancy_regex` shim and paper over differences.** + Rejected: the differences are unbounded and silent, and `test_re.py` + asserts on engine internals a shim can't reproduce. Every patch + would be whack-a-mole against a foreign engine's choices. +2. **Port `_sre` *and* write `Pattern`/`Match` as native Rust types.** + More faithful to CPython's type identity, but a large second native + object surface (GC, attributes, repr, pickle) for marginal gain over + frozen-Python classes. Deferred to future work if type-identity + parity is ever required. +3. **Compile Python regex to the Rust `regex` crate's AST.** A + translation layer by another name — same fidelity ceiling as the + shim, plus a new impedance mismatch (no backtracking, different + group semantics). Rejected. +4. **A bytecode-level `re` fast path in the VM.** Premature: get + faithful first, optimise the hot `exec` loop later (see *Future + work*). + +## Prior art + +- **CPython** is the reference; we port its engine rather than imitate + it. The secret-labs engine (Fredrik Lundh) has been stable in shape + since Python 1.6, which is what makes a verbatim parser/compiler port + viable. +- **PyPy** reimplements `_sre` in RPython but keeps `_sre.py`'s + structure and the CPython `re` Python layer — the same "port the C + core, reuse the Python" strategy this RFC follows. +- **RustPython** ships a hand-written `sre-engine` Rust crate plus the + CPython Python layer — close to our approach; our matcher independently + arrives at the same `toplevel`/`must_advance` structure, which is good + corroboration that it's the load-bearing invariant. +- **GraalPy** runs CPython's `_sre` Python over a Truffle-based engine; + again, the Python layer is reused, not rewritten. + +The cross-implementation consensus — *reuse CPython's Python `re` +layer, reimplement only the C core* — is exactly what this RFC adopts. 
+ +## Unresolved questions + +- **`localeconv`/`LOCALE`-flag fidelity.** `IN_LOC_IGNORE`/ + `CATEGORY_LOC_*` depend on the C locale; we implement the structure + but the locale tables are the byte locale only. Full locale parity is + deferred (CPython itself discourages `re.LOCALE` on str patterns). +- **Native-stack depth vs `sys.setrecursionlimit`.** Should the matcher + consult the Python recursion limit to raise `RecursionError` instead + of risking a native overflow on adversarial input? +- **Pickling compiled patterns.** Do we need `Pattern.__reduce__` to + round-trip through `re.compile(pattern, flags)` (CPython's approach) + before any real workload needs it? + +## Future work + +- **Optimise the `exec` hot loop.** The faithful matcher is correctness- + first; a charset-prefix fast path and a flattened dispatch (or a + Tier-2 JIT intrinsic per RFC 0032) can follow now that behaviour is + pinned by `test_re.py`. +- **Native `Pattern`/`Match` types** if/when `type` identity or C-level + pickling parity is required. +- **Wire the full `Lib/test/test_re.py`** (including the C-detail + refleak/`gc` hooks) into the RFC 0034 opt-in CPython sweep, not just + the bundled subset. +- **`regex`-module-style atomic-group/possessive optimisations** kept + behind CPython-compatible semantics. +- **Locale tables** for `re.LOCALE` parity on bytes patterns. + +[`regex`]: https://docs.rs/regex +[`fancy_regex`]: https://docs.rs/fancy-regex diff --git a/tests/regrtest/test_re.py b/tests/regrtest/test_re.py new file mode 100644 index 0000000..a98f410 --- /dev/null +++ b/tests/regrtest/test_re.py @@ -0,0 +1,116 @@ +"""Regression coverage for the faithful ``re`` / ``_sre`` engine. + +Exercises the CPython-ported backtracking matcher: quantifiers, +groups, backreferences, look-around, alternation, flags, Unicode and +bytes patterns, plus the zero-width scanning behaviour that previously +looped forever. All expectations were diffed against CPython 3.13. 
+""" + +import re + +# --- basic matching / search ------------------------------------------ +assert re.match("abc", "abcdef").span() == (0, 3) +assert re.search("cd", "abcdef").span() == (2, 4) +assert re.match("abc", "xabc") is None +assert re.fullmatch("a.c", "abc") is not None +assert re.fullmatch("a.c", "abcd") is None + +# --- quantifiers ------------------------------------------------------- +assert re.findall(r"a{2,4}", "a aa aaa aaaa aaaaa") == ["aa", "aaa", "aaaa", "aaaa"] +assert re.findall(r"a{,3}", "aaaaa") == ["aaa", "aa", ""] +assert re.findall(r"<.+>", "<a><b>") == ["<a><b>"] # greedy +assert re.findall(r"<.+?>", "<a><b>") == ["<a>", "<b>"] # lazy +assert re.search(r"a.*c", "abcabc").span() == (0, 6) +assert re.search(r"a.*?c", "abcabc").span() == (0, 3) + +# --- alternation / groups --------------------------------------------- +assert re.fullmatch("a|ab", "ab").group(0) == "ab" # toplevel branch guard +assert re.match(r"(a)(b)(c)", "abc").groups() == ("a", "b", "c") +assert re.match(r"(a)(b)?(c)", "ac").groups() == ("a", None, "c") +assert re.match(r"(a)(b)?(c)", "ac").groups("X") == ("a", "X", "c") +m = re.match(r"(?P<y>\d{4})-(?P<m>\d{2})", "2026-05") +assert m.groupdict() == {"y": "2026", "m": "05"} +assert m["y"] == "2026" and m.group("m") == "05" +assert m.lastgroup == "m" and m.lastindex == 2 + +# --- backreferences ---------------------------------------------------- +assert re.findall(r"(\w)\1", "aa bb cd ee") == ["a", "b", "e"] +assert re.search(r"(?P<q>['\"]).*?(?P=q)", "say 'hi' done").group(0) == "'hi'" +assert re.findall(r"<(\w+)>.*?</\1>", "<b>x</b><i>y</i>") == ["b", "i"] + +# --- look-around ------------------------------------------------------- +assert re.findall(r"\d+(?= dollars)", "100 dollars, 50 cents") == ["100"] +assert re.findall(r"\d+(?!
dollars)", "100 dollars 50 cents") == ["10", "50"] +assert re.findall(r"(?<=\$)\d+", "$100 and 50") == ["100"] +assert re.findall(r"(?\w+)", r"[\g]", "hi there") == "[hi] [there]" +assert re.subn(r"\d+", "#", "a1b22c333") == ("a#b#c#", 3) +assert re.sub(r"a", "b", "aaaa", count=2) == "bbaa" +assert re.sub(r"\d+", lambda mo: str(int(mo.group()) * 2), "1 2 3") == "2 4 6" +assert re.match(r"(\w+) (\w+)", "John Smith").expand(r"\2 \1") == "Smith John" + +# --- flags ------------------------------------------------------------- +assert re.findall(r"abc", "ABC abc", re.I) == ["ABC", "abc"] +assert re.findall(r"(?i)abc", "ABC abc") == ["ABC", "abc"] +assert re.findall(r"(?i:ab)c", "ABc abc ABC") == ["ABc", "abc"] +assert re.findall(r"^\w+", "foo\nbar\nbaz", re.M) == ["foo", "bar", "baz"] +assert re.findall(r"a.b", "a\nb", re.S) == ["a\nb"] +assert re.findall(r"""\d + # int + \. # dot + \d * # frac""", "3.14 x", re.X) == ["3.14"] + +# --- unicode vs ascii -------------------------------------------------- +assert re.findall(r"\w+", "café déjà") == ["café", "déjà"] +assert re.findall(r"\w+", "café", re.A) == ["caf"] +assert re.findall(r"\d+", "\uff11\uff12 99") == ["\uff11\uff12", "99"] # fullwidth +assert re.match(r"(?i)\u00e9", "\u00c9") is not None # é ~ É +assert re.findall(r"\s", "a b\tc\u00a0d") == [" ", "\t", "\u00a0"] + +# --- bytes patterns ---------------------------------------------------- +assert re.findall(rb"\d+", b"a12b345") == [b"12", b"345"] +assert re.sub(rb"\s+", b"_", b"a b\tc") == b"a_b_c" +assert re.match(rb"(\w+)@(\w+)", b"user@host").groups() == (b"user", b"host") +assert re.split(rb"[,;]", b"a,b;c") == [b"a", b"b", b"c"] +assert re.findall(rb"[\x00-\x02]", bytes(range(5))) == [b"\x00", b"\x01", b"\x02"] + +# --- possessive / atomic ---------------------------------------------- +assert re.search(r"(?>a+)b", "aaab") is not None +assert re.search(r"(?>a+)a", "aaa") is None # atomic: no give-back +assert re.findall(r"a*+", "aaab") == ["aaa", "", ""] + 
+# --- escape / error semantics ----------------------------------------- +assert re.escape("a.b*c+d?") == r"a\.b\*c\+d\?" +for bad, msg in [ + (r"(?Pa)(?Pb)", "redefinition"), + (r"a{2,1}", "min repeat greater than max repeat"), + (r"(?P=undef)", "unknown group name"), + (r"[", "unterminated character set"), + (r"a\1", "invalid group reference"), +]: + try: + re.compile(bad) + except re.error as e: + assert msg in str(e), (bad, str(e)) + else: + raise AssertionError("expected re.error for %r" % bad) + +# --- compiled Pattern surface ----------------------------------------- +p = re.compile(r"(\d+)") +assert p.pattern == r"(\d+)" and p.groups == 1 +assert [mo.group(1) for mo in p.finditer("a1b22c")] == ["1", "22"] +assert isinstance(re.match(r"x", "x").re, re.Pattern) + +print("ok") From 481759bc1cb5f6d03182ee906083042a8844917f Mon Sep 17 00:00:00 2001 From: Owen Carey <37121709+owenthcarey@users.noreply.github.com> Date: Sun, 31 May 2026 16:37:35 -0700 Subject: [PATCH 2/2] style: format sre_mod with rustfmt --- crates/weavepy-vm/src/stdlib/sre_mod.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/weavepy-vm/src/stdlib/sre_mod.rs b/crates/weavepy-vm/src/stdlib/sre_mod.rs index deb821a..429f29c 100644 --- a/crates/weavepy-vm/src/stdlib/sre_mod.rs +++ b/crates/weavepy-vm/src/stdlib/sre_mod.rs @@ -196,7 +196,8 @@ fn unicode_iscased(ch: u32) -> bool { } fn ascii_iscased(ch: u32) -> bool { - (u32::from(b'a')..=u32::from(b'z')).contains(&ch) || (u32::from(b'A')..=u32::from(b'Z')).contains(&ch) + (u32::from(b'a')..=u32::from(b'z')).contains(&ch) + || (u32::from(b'A')..=u32::from(b'Z')).contains(&ch) } // --------------------------------------------------------------------------- @@ -1451,17 +1452,17 @@ fn sre_exec(args: &[Object]) -> Result { } fn sre_ascii_tolower(args: &[Object]) -> Result { - Ok(Object::Int( - i64::from(lower_ascii(arg_i64(args, 0, "ch")? 
as u32)) - )) + Ok(Object::Int(i64::from(lower_ascii( + arg_i64(args, 0, "ch")? as u32 + )))) } fn sre_ascii_iscased(args: &[Object]) -> Result { Ok(Object::Bool(ascii_iscased(arg_i64(args, 0, "ch")? as u32))) } fn sre_unicode_tolower(args: &[Object]) -> Result { - Ok(Object::Int( - i64::from(lower_unicode(arg_i64(args, 0, "ch")? as u32)) - )) + Ok(Object::Int(i64::from(lower_unicode( + arg_i64(args, 0, "ch")? as u32, + )))) } fn sre_unicode_iscased(args: &[Object]) -> Result { Ok(Object::Bool(