From da8aad9a087add501eb340b889cae94d01959f11 Mon Sep 17 00:00:00 2001 From: Daniel Frankcom Date: Tue, 24 Mar 2026 19:49:21 +0000 Subject: [PATCH 1/2] Add string operator $indexOf*/$strLen* and integration tests Signed-off-by: Daniel Frankcom --- .../test_indexOfBytes_encoding.py | 352 +++++++++++++ .../test_indexOfBytes_index_types.py | 312 ++++++++++++ .../test_indexOfBytes_invalid_args.py | 208 ++++++++ .../indexOfBytes/test_indexOfBytes_null.py | 108 ++++ .../indexOfBytes/test_indexOfBytes_search.py | 313 ++++++++++++ .../test_indexOfBytes_size_limit.py | 88 ++++ .../test_indexOfBytes_type_errors.py | 471 +++++++++++++++++ .../indexOfBytes/test_indexOfBytes_usage.py | 134 +++++ .../string/indexOfBytes/utils/__init__.py | 0 .../indexOfBytes/utils/indexOfBytes_common.py | 16 + .../indexOfCP/test_indexOfCP_encoding.py | 294 +++++++++++ .../indexOfCP/test_indexOfCP_index_types.py | 317 ++++++++++++ .../indexOfCP/test_indexOfCP_invalid_args.py | 212 ++++++++ .../string/indexOfCP/test_indexOfCP_null.py | 105 ++++ .../string/indexOfCP/test_indexOfCP_search.py | 286 +++++++++++ .../indexOfCP/test_indexOfCP_size_limit.py | 86 ++++ .../indexOfCP/test_indexOfCP_type_errors.py | 474 ++++++++++++++++++ .../string/indexOfCP/test_indexOfCP_usage.py | 135 +++++ .../string/indexOfCP/utils/__init__.py | 0 .../indexOfCP/utils/indexOfCP_common.py | 16 + .../test_strLenBytes_byte_counts.py | 305 +++++++++++ .../test_strLenBytes_input_forms.py | 133 +++++ .../test_strLenBytes_invalid_args.py | 99 ++++ .../test_strLenBytes_invariants.py | 124 +++++ .../test_strLenBytes_size_limit.py | 53 ++ .../test_strLenBytes_type_errors.py | 206 ++++++++ .../string/strLenBytes/utils/__init__.py | 0 .../strLenBytes/utils/strLenBytes_common.py | 20 + .../strLenCP/test_strLenCP_code_points.py | 299 +++++++++++ .../strLenCP/test_strLenCP_input_forms.py | 129 +++++ .../strLenCP/test_strLenCP_invalid_args.py | 101 ++++ .../strLenCP/test_strLenCP_invariants.py | 118 +++++ 
.../strLenCP/test_strLenCP_size_limit.py | 53 ++ .../strLenCP/test_strLenCP_type_errors.py | 204 ++++++++ .../string/strLenCP/utils/__init__.py | 0 .../string/strLenCP/utils/strLenCP_common.py | 20 + .../string/test_string_combination.py | 255 ++++++++++ 37 files changed, 6046 insertions(+) create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/indexOfBytes_common.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py create mode 
100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/indexOfCP_common.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/strLenBytes_common.py create mode 100644 
documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/strLenCP_common.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py new file mode 100644 index 00000000..96891a5c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + IndexOfBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Encoding]: 
the operator matches and indexes by raw UTF-8 byte sequences without +# Unicode normalization, and the result is a byte index rather than a codepoint index. +INDEXOFBYTES_ENCODING_TESTS: list[IndexOfBytesTest] = [ + # Character after 2-byte é (U+00E9): byte index 2, codepoint index 1. + IndexOfBytesTest( + "encoding_after_2byte", + args=["éa", "a"], + expected=2, + msg="$indexOfBytes should return byte index 2 after 2-byte character", + ), + # Character after 3-byte 日 (U+65E5): byte index 3, codepoint index 1. + IndexOfBytesTest( + "encoding_after_3byte", + args=["日a", "a"], + expected=3, + msg="$indexOfBytes should return byte index 3 after 3-byte character", + ), + # Character after 4-byte 🎉 (U+1F389): byte index 4, codepoint index 1. + IndexOfBytesTest( + "encoding_after_4byte", + args=["🎉a", "a"], + expected=4, + msg="$indexOfBytes should return byte index 4 after 4-byte character", + ), + IndexOfBytesTest( + "encoding_find_multibyte_substr", + args=["café", "é"], + expected=3, + msg="$indexOfBytes should find 2-byte substring at correct byte offset", + ), + # Decomposed é ("e" U+0065 + combining acute U+0301) found in a decomposed string. + IndexOfBytesTest( + "encoding_find_decomposed_in_decomposed", + args=["cafe\u0301", "e\u0301"], + expected=3, + msg="$indexOfBytes should find decomposed sequence in decomposed string", + ), + # ASCII "e" does not match precomposed é (U+00E9). + IndexOfBytesTest( + "encoding_ascii_vs_precomposed", + args=["café", "e"], + expected=-1, + msg="$indexOfBytes should not match ASCII 'e' against precomposed é", + ), + # Decomposed é is "e" (U+0065) + combining acute (U+0301). "e" matches the e byte. + IndexOfBytesTest( + "encoding_ascii_in_decomposed", + args=["cafe\u0301", "e"], + expected=3, + msg="$indexOfBytes should find ASCII 'e' in decomposed é sequence", + ), + # Precomposed é (U+00E9) and decomposed e + combining acute (U+0301) are distinct byte + # sequences. 
+ IndexOfBytesTest( + "encoding_precomposed_in_decomposed", + args=["cafe\u0301", "\u00e9"], + expected=-1, + msg="$indexOfBytes should not find precomposed é in decomposed string", + ), + IndexOfBytesTest( + "encoding_decomposed_in_precomposed", + args=["caf\u00e9", "e\u0301"], + expected=-1, + msg="$indexOfBytes should not find decomposed é in precomposed string", + ), + # start=1 falls in the middle of 2-byte é; partial bytes are skipped, "a" found at byte 2. + IndexOfBytesTest( + "encoding_start_mid_2byte", + args=["éab", "a", 1], + expected=2, + msg="$indexOfBytes should skip partial 2-byte char when start is mid-character", + ), + # start=1 falls in the middle of 3-byte 日; "a" found at byte 3. + IndexOfBytesTest( + "encoding_start_mid_3byte", + args=["日a", "a", 1], + expected=3, + msg="$indexOfBytes should skip partial 3-byte char when start is mid-character", + ), + # start=1 falls in the middle of 4-byte 🎉; "a" found at byte 4. + IndexOfBytesTest( + "encoding_start_mid_4byte", + args=["🎉a", "a", 1], + expected=4, + msg="$indexOfBytes should skip partial 4-byte char when start is mid-character", + ), + # é spans bytes 1-2, but end=2 excludes byte 2 so the match is partial. + IndexOfBytesTest( + "encoding_end_mid_2byte", + args=["aé", "é", 0, 2], + expected=-1, + msg="$indexOfBytes should return -1 when end splits a 2-byte character", + ), + # 日 spans bytes 1-3, but end=2 splits it. + IndexOfBytesTest( + "encoding_end_mid_3byte", + args=["a日", "日", 0, 2], + expected=-1, + msg="$indexOfBytes should return -1 when end splits a 3-byte character", + ), + # 🎉 spans bytes 1-4, but end=2 splits it. + IndexOfBytesTest( + "encoding_end_mid_4byte", + args=["a🎉", "🎉", 0, 2], + expected=-1, + msg="$indexOfBytes should return -1 when end splits a 4-byte character", + ), + # Start and end parameters are byte offsets; 寿司 is two 3-byte characters (6 bytes total). 
+ IndexOfBytesTest( + "encoding_multibyte_start", + args=["寿司", "司", 3], + expected=3, + msg="$indexOfBytes should find 3-byte char at byte offset with start", + ), + IndexOfBytesTest( + "encoding_multibyte_start_end_excludes", + args=["寿司", "司", 3, 3], + expected=-1, + msg="$indexOfBytes should return -1 when end excludes match in multi-byte string", + ), + IndexOfBytesTest( + "encoding_multibyte_start_end_includes", + args=["寿司", "司", 3, 6], + expected=3, + msg="$indexOfBytes should find match when end includes it in multi-byte string", + ), + IndexOfBytesTest( + "encoding_multibyte_start_past_match", + args=["寿司", "司", 6], + expected=-1, + msg="$indexOfBytes should return -1 when start is past match in multi-byte string", + ), + # Null bytes are single-byte characters and do not terminate the string. + IndexOfBytesTest( + "encoding_null_byte_found", + args=["a\x00b", "\x00"], + expected=1, + msg="$indexOfBytes should find embedded null byte", + ), + IndexOfBytesTest( + "encoding_after_null_byte", + args=["a\x00b", "b"], + expected=2, + msg="$indexOfBytes should find character after embedded null byte", + ), + IndexOfBytesTest( + "encoding_null_byte_with_start", + args=["a\x00a\x00", "\x00", 2], + expected=3, + msg="$indexOfBytes should find second null byte when start skips first", + ), + IndexOfBytesTest( + "encoding_null_byte_with_start_end", + args=["\x00a\x00ab", "\x00", 2, 3], + expected=2, + msg="$indexOfBytes should find null byte within start/end range", + ), + # Control characters U+0001 and U+001F are single-byte and findable. + IndexOfBytesTest( + "encoding_control_soh", + args=["a\x01b", "\x01"], + expected=1, + msg="$indexOfBytes should find SOH control character", + ), + IndexOfBytesTest( + "encoding_control_us", + args=["a\x1fb", "\x1f"], + expected=1, + msg="$indexOfBytes should find US control character", + ), + # Whitespace characters are findable as substrings. 
+ IndexOfBytesTest( + "encoding_tab", + args=["a\tb", "\t"], + expected=1, + msg="$indexOfBytes should find tab character", + ), + IndexOfBytesTest( + "encoding_newline", + args=["a\nb", "\n"], + expected=1, + msg="$indexOfBytes should find newline character", + ), + IndexOfBytesTest( + "encoding_cr", + args=["a\rb", "\r"], + expected=1, + msg="$indexOfBytes should find carriage return character", + ), + # U+00A0 non-breaking space (2 bytes) is distinct from ASCII space. + IndexOfBytesTest( + "encoding_nbsp_found", + args=["a\u00a0b", "\u00a0"], + expected=1, + msg="$indexOfBytes should find non-breaking space", + ), + IndexOfBytesTest( + "encoding_nbsp_vs_space", + args=["a\u00a0b", " "], + expected=-1, + msg="$indexOfBytes should not match ASCII space against NBSP", + ), + IndexOfBytesTest( + "encoding_space_vs_nbsp", + args=["a b", "\u00a0"], + expected=-1, + msg="$indexOfBytes should not match NBSP against ASCII space", + ), + # U+2000 (EN QUAD, an en-width space; 3 bytes) is distinct from ASCII space. + IndexOfBytesTest( + "encoding_en_space_found", + args=["a\u2000b", "\u2000"], + expected=1, + msg="$indexOfBytes should find en space", + ), + IndexOfBytesTest( + "encoding_en_space_vs_space", + args=["a\u2000b", " "], + expected=-1, + msg="$indexOfBytes should not match ASCII space against en space", + ), + # Zero-width characters are findable at their byte offsets. + # U+FEFF BOM (3 bytes) at byte 5 in "hello\ufeff". + IndexOfBytesTest( + "encoding_bom_found", + args=["hello\ufeff", "\ufeff"], + expected=5, + msg="$indexOfBytes should find BOM at correct byte offset", + ), + # U+200B ZWSP (3 bytes) at byte 1 in "a\u200b". + IndexOfBytesTest( + "encoding_zwsp_found", + args=["a\u200b", "\u200b"], + expected=1, + msg="$indexOfBytes should find zero-width space", + ), + # U+0301 combining acute (2 bytes) is findable within a combining sequence at byte 4. 
+ IndexOfBytesTest( + "encoding_combining_mark_alone", + args=["cafe\u0301", "\u0301"], + expected=4, + msg="$indexOfBytes should find combining mark alone at byte offset", + ), + # é (U+00E9, UTF-8: C3 A9) and Ã (U+00C3, UTF-8: C3 83) share leading UTF-8 byte 0xC3 but + # are not cross-matched. + IndexOfBytesTest( + "encoding_no_leading_byte_cross", + args=["\u00e9", "\u00c3"], + expected=-1, + msg="$indexOfBytes should not cross-match chars sharing a leading UTF-8 byte", + ), + IndexOfBytesTest( + "encoding_no_leading_byte_cross_rev", + args=["\u00c3", "\u00e9"], + expected=-1, + msg="$indexOfBytes should not cross-match chars sharing a leading UTF-8 byte (reversed)", + ), +] + +# Property [Case Sensitivity]: search is strictly case-sensitive with no Unicode case folding, +# ligature expansion, or locale-dependent mapping. +INDEXOFBYTES_CASE_SENSITIVITY_TESTS: list[IndexOfBytesTest] = [ + # ASCII case differences. + IndexOfBytesTest( + "case_ascii_upper_in_lower", + args=["hello", "E"], + expected=-1, + msg="$indexOfBytes should not find uppercase in lowercase ASCII", + ), + IndexOfBytesTest( + "case_ascii_lower_in_upper", + args=["HELLO", "e"], + expected=-1, + msg="$indexOfBytes should not find lowercase in uppercase ASCII", + ), + # Greek sigma: uppercase U+03A3 vs lowercase U+03C3. + IndexOfBytesTest( + "case_greek_upper_sigma", + args=["σ", "Σ"], + expected=-1, + msg="$indexOfBytes should not case-fold Greek sigma", + ), + IndexOfBytesTest( + "case_greek_lower_sigma", + args=["Σ", "σ"], + expected=-1, + msg="$indexOfBytes should not case-fold Greek uppercase sigma", + ), + # Cyrillic: uppercase U+0414 vs lowercase U+0434. 
+ IndexOfBytesTest( + "case_cyrillic_upper_de", + args=["д", "Д"], + expected=-1, + msg="$indexOfBytes should not case-fold Cyrillic de", + ), + IndexOfBytesTest( + "case_cyrillic_lower_de", + args=["Д", "д"], + expected=-1, + msg="$indexOfBytes should not case-fold Cyrillic uppercase de", + ), + # No sharp-s expansion: U+00DF does not match "SS" or "ss". + IndexOfBytesTest( + "case_sharp_s_vs_upper_ss", + args=["ß", "SS"], + expected=-1, + msg="$indexOfBytes should not expand sharp-s to SS", + ), + IndexOfBytesTest( + "case_sharp_s_vs_lower_ss", + args=["ß", "ss"], + expected=-1, + msg="$indexOfBytes should not expand sharp-s to ss", + ), + # No ligature expansion: U+FB01 does not match "fi". + IndexOfBytesTest( + "case_ligature_fi", + args=["\ufb01", "fi"], + expected=-1, + msg="$indexOfBytes should not expand fi ligature", + ), + # No locale-dependent mapping: U+0131 (dotless i) does not match "i". + IndexOfBytesTest( + "case_dotless_i", + args=["\u0131", "i"], + expected=-1, + msg="$indexOfBytes should not map dotless i to ASCII i", + ), +] + + +INDEXOFBYTES_ENCODING_ALL_TESTS = INDEXOFBYTES_ENCODING_TESTS + INDEXOFBYTES_CASE_SENSITIVITY_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_ENCODING_ALL_TESTS)) +def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest): + """Test $indexOfBytes cases.""" + result = execute_expression(collection, {"$indexOfBytes": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py new file mode 100644 index 00000000..fa4a85fe --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py @@ -0,0 +1,312 @@ 
+from __future__ import annotations + +import pytest +from bson import Decimal128, Int64 + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import INDEXOF_INDEX_TYPE_ERROR, INDEXOF_NEGATIVE_INDEX_ERROR +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_NEGATIVE_INFINITY, + DECIMAL128_NEGATIVE_ZERO, + DECIMAL128_ZERO, + DOUBLE_NEGATIVE_ZERO, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT32_OVERFLOW, + INT32_UNDERFLOW, + INT64_ZERO, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + IndexOfBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Index Type Acceptance]: integral Decimal128, Int64, and whole-number floats are accepted +# for start and end. 
+INDEXOFBYTES_INDEX_TYPE_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "idx_type_float_start", + args=["hello", "lo", 3.0], + expected=3, + msg="$indexOfBytes should accept whole-number float as start", + ), + IndexOfBytesTest( + "idx_type_float_start_and_end", + args=["hello", "lo", DOUBLE_ZERO, 5.0], + expected=3, + msg="$indexOfBytes should accept whole-number floats as start and end", + ), + IndexOfBytesTest( + "idx_type_decimal128_start", + args=["hello", "lo", Decimal128("3")], + expected=3, + msg="$indexOfBytes should accept Decimal128 as start", + ), + IndexOfBytesTest( + "idx_type_decimal128_start_and_end", + args=["hello", "lo", DECIMAL128_ZERO, Decimal128("5")], + expected=3, + msg="$indexOfBytes should accept Decimal128 as start and end", + ), + IndexOfBytesTest( + "idx_type_int64_start", + args=["hello", "lo", Int64(3)], + expected=3, + msg="$indexOfBytes should accept Int64 as start", + ), + IndexOfBytesTest( + "idx_type_int64_start_and_end", + args=["hello", "lo", INT64_ZERO, Int64(5)], + expected=3, + msg="$indexOfBytes should accept Int64 as start and end", + ), +] + +# Property [Numeric Edge Cases]: negative zero is treated as 0, and Decimal128 with trailing zeros +# or exponent notation resolving to an integer is accepted. +INDEXOFBYTES_NUMERIC_EDGE_SUCCESS_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "num_edge_neg_zero_float_start", + args=["hello", "h", DOUBLE_NEGATIVE_ZERO], + expected=0, + msg="$indexOfBytes should treat -0.0 float start as 0", + ), + IndexOfBytesTest( + "num_edge_neg_zero_decimal_start", + args=["hello", "h", DECIMAL128_NEGATIVE_ZERO], + expected=0, + msg="$indexOfBytes should treat Decimal128 negative zero start as 0", + ), + # end=0 means the search range is empty, so no match. 
+ IndexOfBytesTest( + "num_edge_neg_zero_float_end", + args=["hello", "h", 0, DOUBLE_NEGATIVE_ZERO], + expected=-1, + msg="$indexOfBytes should treat -0.0 float end as 0 yielding empty range", + ), + IndexOfBytesTest( + "num_edge_neg_zero_decimal_end", + args=["hello", "h", 0, DECIMAL128_NEGATIVE_ZERO], + expected=-1, + msg="$indexOfBytes should treat Decimal128 negative zero end as 0 yielding empty range", + ), + IndexOfBytesTest( + "num_edge_decimal_trailing_zeros_start", + args=["hello", "lo", Decimal128("3.0")], + expected=3, + msg="$indexOfBytes should accept Decimal128 with trailing zeros as start", + ), + IndexOfBytesTest( + "num_edge_decimal_trailing_zeros_end", + args=["hello", "lo", 0, Decimal128("5.0")], + expected=3, + msg="$indexOfBytes should accept Decimal128 with trailing zeros as end", + ), + IndexOfBytesTest( + "num_edge_decimal_exponent_start", + args=["hello", "lo", Decimal128("3E0")], + expected=3, + msg="$indexOfBytes should accept Decimal128 with exponent notation as start", + ), + IndexOfBytesTest( + "num_edge_decimal_exponent_end", + args=["hello", "lo", 0, Decimal128("5E0")], + expected=3, + msg="$indexOfBytes should accept Decimal128 with exponent notation as end", + ), +] + +# Property [Negative Index]: negative start or end values produce an error. +INDEXOFBYTES_NEGATIVE_INDEX_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "neg_start", + args=["hello", "h", -1], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative int start", + ), + IndexOfBytesTest( + "neg_end", + args=["hello", "h", 0, -1], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative int end", + ), + IndexOfBytesTest( + "neg_both", + args=["hello", "h", -1, -2], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative start and end", + ), + # Negative floats like -1.0 also produce an error. 
+ IndexOfBytesTest( + "neg_float_start", + args=["hello", "h", -1.0], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative float start", + ), + IndexOfBytesTest( + "neg_float_end", + args=["hello", "h", 0, -1.0], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative float end", + ), + IndexOfBytesTest( + "neg_int64_start", + args=["hello", "h", Int64(-1)], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative Int64 start", + ), + IndexOfBytesTest( + "neg_int64_end", + args=["hello", "h", 0, Int64(-1)], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative Int64 end", + ), + IndexOfBytesTest( + "neg_decimal_start", + args=["hello", "h", Decimal128("-1")], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative Decimal128 start", + ), + IndexOfBytesTest( + "neg_decimal_end", + args=["hello", "h", 0, Decimal128("-1")], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfBytes should reject negative Decimal128 end", + ), +] + +# Property [Numeric Edge Cases - Non-Integral and Out-of-Range Errors]: NaN, infinity, and values +# outside the int32 range produce an index type error for start and end, even when the value is +# negative. 
+INDEXOFBYTES_NUMERIC_EDGE_ERROR_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "num_edge_nan_start", + args=["hello", "h", FLOAT_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject NaN float as start", + ), + IndexOfBytesTest( + "num_edge_nan_end", + args=["hello", "h", 0, FLOAT_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject NaN float as end", + ), + IndexOfBytesTest( + "num_edge_decimal_nan_start", + args=["hello", "h", Decimal128("NaN")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 NaN as start", + ), + IndexOfBytesTest( + "num_edge_decimal_nan_end", + args=["hello", "h", 0, Decimal128("NaN")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 NaN as end", + ), + IndexOfBytesTest( + "num_edge_inf_start", + args=["hello", "h", FLOAT_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject positive infinity as start", + ), + IndexOfBytesTest( + "num_edge_inf_end", + args=["hello", "h", 0, FLOAT_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject positive infinity as end", + ), + IndexOfBytesTest( + "num_edge_neg_inf_start", + args=["hello", "h", FLOAT_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject negative infinity as start", + ), + IndexOfBytesTest( + "num_edge_neg_inf_end", + args=["hello", "h", 0, FLOAT_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject negative infinity as end", + ), + IndexOfBytesTest( + "num_edge_decimal_inf_start", + args=["hello", "h", DECIMAL128_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 infinity as start", + ), + IndexOfBytesTest( + "num_edge_decimal_inf_end", + args=["hello", "h", 0, DECIMAL128_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 
infinity as end", + ), + IndexOfBytesTest( + "num_edge_decimal_neg_inf_start", + args=["hello", "h", DECIMAL128_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 negative infinity as start", + ), + IndexOfBytesTest( + "num_edge_decimal_neg_inf_end", + args=["hello", "h", 0, DECIMAL128_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 negative infinity as end", + ), + IndexOfBytesTest( + "num_edge_int64_over_max_start", + args=["hello", "h", Int64(INT32_OVERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Int64 above int32 max as start", + ), + IndexOfBytesTest( + "num_edge_int64_over_max_end", + args=["hello", "h", 0, Int64(INT32_OVERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Int64 above int32 max as end", + ), + IndexOfBytesTest( + "num_edge_decimal_over_max_start", + args=["hello", "h", Decimal128("2147483648")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 above int32 max as start", + ), + IndexOfBytesTest( + "num_edge_decimal_over_max_end", + args=["hello", "h", 0, Decimal128("2147483648")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject Decimal128 above int32 max as end", + ), + # The int32 range check takes precedence over the negative index check. 
+ IndexOfBytesTest( + "num_edge_int64_under_min_start", + args=["hello", "h", Int64(INT32_UNDERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes int32 range error should precede negative error for start", + ), + IndexOfBytesTest( + "num_edge_int64_under_min_end", + args=["hello", "h", 0, Int64(INT32_UNDERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes int32 range error should precede negative error for end", + ), +] + + +INDEXOFBYTES_INDEX_TYPE_ALL_TESTS = ( + INDEXOFBYTES_INDEX_TYPE_TESTS + + INDEXOFBYTES_NUMERIC_EDGE_SUCCESS_TESTS + + INDEXOFBYTES_NEGATIVE_INDEX_TESTS + + INDEXOFBYTES_NUMERIC_EDGE_ERROR_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_INDEX_TYPE_ALL_TESTS)) +def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest): + """Test $indexOfBytes cases.""" + result = execute_expression(collection, {"$indexOfBytes": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py new file mode 100644 index 00000000..bc935678 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp +from bson.code import Code + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + EXPRESSION_ARITY_ERROR, + FAILED_TO_PARSE_ERROR, + INDEXOF_INDEX_TYPE_ERROR, + INDEXOFBYTES_STRING_TYPE_ERROR, + INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + INVALID_DOLLAR_FIELD_PATH, +) 
+from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import DECIMAL128_ONE_AND_HALF +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + IndexOfBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Arity]: fewer than 2 or more than 4 arguments produces an error. +INDEXOFBYTES_ARITY_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "arity_zero", + args=[], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfBytes should reject zero arguments", + ), + IndexOfBytesTest( + "arity_one", + args=["hello"], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfBytes should reject one argument", + ), + IndexOfBytesTest( + "arity_five", + args=["hello", "h", 0, 5, 1], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfBytes should reject five arguments", + ), +] + +# Property [Syntax]: the argument must be an array; non-array values produce an error. 
+INDEXOFBYTES_SYNTAX_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "syntax_string",
+        args="hello",  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject string as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_int",
+        args=42,  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject int as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_bool",
+        args=True,  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject bool as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_binary",
+        args=Binary(b"data"),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject binary as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_date",
+        args=datetime(2024, 1, 1, tzinfo=timezone.utc),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject date as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_decimal128",
+        args=DECIMAL128_ONE_AND_HALF,  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject decimal128 as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_float",
+        args=3.14,  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject float as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_long",
+        args=Int64(42),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject long as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_maxkey",
+        args=MaxKey(),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject maxkey as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_minkey",
+        args=MinKey(),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject minkey as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_object",
+        args={"a": 1},  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject object as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_objectid",
+        args=ObjectId("507f1f77bcf86cd799439011"),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject objectid as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_regex",
+        args=Regex("pattern"),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject regex as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_timestamp",
+        args=Timestamp(1, 1),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject timestamp as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_code",
+        args=Code("function() {}"),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject javascript code as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_code_scope",
+        args=Code("function() {}", {"x": 1}),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject code with scope as argument",
+    ),
+    IndexOfBytesTest(
+        "syntax_binary_uuid",
+        args=Binary(b"data", 4),  # type: ignore[arg-type]
+        error_code=EXPRESSION_ARITY_ERROR,
+        msg="$indexOfBytes should reject binary UUID as argument",
+    ),
+]
+
+# Property [Expression Returning Wrong Type]: an operand expression that evaluates to the wrong
+# type at runtime is rejected with the same per-position error code as a literal of that type.
+INDEXOFBYTES_EXPR_TYPE_ERROR_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "expr_type_int_string_pos",
+        args=[{"$add": [1, 2]}, "lo"],
+        error_code=INDEXOFBYTES_STRING_TYPE_ERROR,
+        msg="$indexOfBytes should reject expression resolving to int for string arg",
+    ),
+    IndexOfBytesTest(
+        "expr_type_int_substring_pos",
+        args=["hello", {"$add": [1, 2]}],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes should reject expression resolving to int for substring arg",
+    ),
+    IndexOfBytesTest(
+        "expr_type_string_start_pos",
+        args=["hello", "lo", {"$concat": ["x"]}],
+        error_code=INDEXOF_INDEX_TYPE_ERROR,
+        msg="$indexOfBytes should reject expression resolving to string for start arg",
+    ),
+]
+
+# Property [Dollar Sign Error]: a bare "$" is interpreted as a field path and "$$" is interpreted
+# as an empty variable name; each is rejected with its own error code.
+INDEXOFBYTES_DOLLAR_SIGN_ERROR_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "dollar_bare_error",
+        args=["hello$world", "$"],
+        error_code=INVALID_DOLLAR_FIELD_PATH,
+        msg="$indexOfBytes should reject bare '$' as field path in substring",
+    ),
+    IndexOfBytesTest(
+        "dollar_double_error",
+        args=["hello", "$$"],
+        error_code=FAILED_TO_PARSE_ERROR,
+        msg="$indexOfBytes should reject '$$' as empty variable name in substring",
+    ),
+]
+
+
+INDEXOFBYTES_INVALID_ARGS_TESTS = (
+    INDEXOFBYTES_ARITY_TESTS
+    + INDEXOFBYTES_SYNTAX_TESTS
+    + INDEXOFBYTES_EXPR_TYPE_ERROR_TESTS
+    + INDEXOFBYTES_DOLLAR_SIGN_ERROR_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_INVALID_ARGS_TESTS))
+def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest):
+    """Run one $indexOfBytes invalid-argument case and check its expected result or error code."""
+    result = execute_expression(collection, {"$indexOfBytes": test_case.args})
+    assertResult(
+        result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg
+    )
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py
b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py
new file mode 100644
index 00000000..a2cb1a5b
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import INDEXOFBYTES_SUBSTRING_TYPE_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import MISSING
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import (
+    IndexOfBytesTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# (args, id-suffix) shapes; _build_null_tests replaces _PLACEHOLDER with None or MISSING.
+_PLACEHOLDER = object()
+_NULL_PATTERNS = [
+    ([_PLACEHOLDER, "hello"], "first_arg"),
+    ([_PLACEHOLDER, "hello", 0], "first_arg_with_start"),
+    ([_PLACEHOLDER, "hello", 0, 5], "first_arg_with_start_end"),
+    # First arg null/missing takes precedence over second arg null (which would otherwise error).
+    ([_PLACEHOLDER, None], "precedence_second_null"),
+    # First arg null/missing takes precedence over start null (which would otherwise error).
+    ([_PLACEHOLDER, "sub", None], "precedence_start_null"),
+    # First arg null/missing takes precedence over end null (which would otherwise error).
+    ([_PLACEHOLDER, "sub", 0, None], "precedence_end_null"),
+    # First arg null/missing takes precedence over errors from other null args.
+    ([_PLACEHOLDER, None, None], "precedence_all_null"),
+    ([_PLACEHOLDER, None, None, None], "precedence_all_four_null"),
+]
+
+
+def _build_null_tests(null_value, prefix) -> list[IndexOfBytesTest]:
+    return [
+        IndexOfBytesTest(
+            f"{prefix}_{suffix}",
+            args=[null_value if a is _PLACEHOLDER else a for a in args],
+            expected=None,
+            msg=f"$indexOfBytes should return null when {prefix} {suffix}",
+        )
+        for args, suffix in _NULL_PATTERNS
+    ]
+
+
+# Property [Null Behavior]: when the first argument is null, the result is null regardless of other
+# arguments.
+INDEXOFBYTES_NULL_TESTS = _build_null_tests(None, "null")
+
+
+# Property [Missing Behavior]: when the first argument references a missing field, the result is
+# null regardless of other arguments.
+INDEXOFBYTES_MISSING_TESTS = _build_null_tests(MISSING, "missing")
+
+# Property [Null and Missing Errors - Precedence]: second arg error takes precedence over
+# third/fourth arg errors.
+INDEXOFBYTES_NULL_MISSING_PRECEDENCE_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "null_err_second_precedes_third",
+        args=["hello", None, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes null substring error should precede null start error",
+    ),
+    IndexOfBytesTest(
+        "null_err_second_precedes_fourth",
+        args=["hello", None, 0, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes null substring error should precede null end error",
+    ),
+    IndexOfBytesTest(
+        "null_err_second_precedes_all",
+        args=["hello", None, None, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes null substring error should precede all other null errors",
+    ),
+    IndexOfBytesTest(
+        "missing_err_second_precedes_third",
+        args=["hello", MISSING, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes missing substring error should precede null start error",
+    ),
+    IndexOfBytesTest(
+        "missing_err_second_precedes_fourth",
+        args=["hello", MISSING, 0, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes missing substring error should precede null end error",
+    ),
+    IndexOfBytesTest(
+        "missing_err_second_precedes_all",
+        args=["hello", MISSING, None, None],
+        error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfBytes missing substring error should precede all other null errors",
+    ),
+]
+
+
+INDEXOFBYTES_NULL_TESTS_ALL = (
+    INDEXOFBYTES_NULL_TESTS
+    + INDEXOFBYTES_MISSING_TESTS
+    + INDEXOFBYTES_NULL_MISSING_PRECEDENCE_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_NULL_TESTS_ALL))
+def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest):
+    """Run one $indexOfBytes null/missing case and check its expected result or error code."""
+    result = execute_expression(collection, {"$indexOfBytes": test_case.args})
+    assertResult(
+        result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg
+    )
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py
new file mode 100644
index 00000000..f90d45c2
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py
@@ -0,0 +1,313 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import (
+    IndexOfBytesTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Property [First Occurrence]: when the substring appears multiple times, the result is the byte
+# index of the first occurrence.
+INDEXOFBYTES_FIRST_OCCURRENCE_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "first_occ_from_start",
+        args=["abcabc", "abc"],
+        expected=0,
+        msg="$indexOfBytes should find first occurrence at start",
+    ),
+    IndexOfBytesTest(
+        "first_occ_start_skips_first",
+        args=["abcabc", "abc", 1],
+        expected=3,
+        msg="$indexOfBytes should find second occurrence when start skips first",
+    ),
+    IndexOfBytesTest(
+        "first_occ_repeated_char",
+        args=["aaaa", "a"],
+        expected=0,
+        msg="$indexOfBytes should find first of repeated single characters",
+    ),
+    IndexOfBytesTest(
+        "first_occ_repeated_char_with_start",
+        args=["aaaa", "a", 2],
+        expected=2,
+        msg="$indexOfBytes should find first occurrence at start offset in repeated chars",
+    ),
+    IndexOfBytesTest(
+        "first_occ_with_end",
+        args=["abcabc", "abc", 0, 6],
+        expected=0,
+        msg="$indexOfBytes should find first occurrence within end boundary",
+    ),
+    IndexOfBytesTest(
+        "first_occ_start_and_end",
+        args=["abcabc", "abc", 1, 6],
+        expected=3,
+        msg="$indexOfBytes should find occurrence within start and end range",
+    ),
+]
+
+# Property [Empty Substring]: empty substring returns start when start <= byte_length, and -1 when
+# start > byte_length or start > end; start == end is an empty range and follows the same rule.
+INDEXOFBYTES_EMPTY_SUBSTRING_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "empty_sub_non_empty_string",
+        args=["hello", ""],
+        expected=0,
+        msg="$indexOfBytes should return 0 for empty substring in non-empty string",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_with_start",
+        args=["hello", "", 3],
+        expected=3,
+        msg="$indexOfBytes should return start for empty substring with start offset",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_empty_string",
+        args=["", ""],
+        expected=0,
+        msg="$indexOfBytes should return 0 for empty substring in empty string",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_at_byte_length",
+        args=["hello", "", 5],
+        expected=5,
+        msg="$indexOfBytes should return byte length for empty substring at end",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_beyond_byte_length",
+        args=["hello", "", 6],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring beyond byte length",
+    ),
+    # Multi-byte string: "café" is 5 bytes (é is 2 bytes).
+    IndexOfBytesTest(
+        "empty_sub_multibyte_start_at_byte_length",
+        args=["café", "", 5],
+        expected=5,
+        msg="$indexOfBytes should return byte length for empty substr at end of multi-byte string",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_multibyte_start_beyond_byte_length",
+        args=["café", "", 6],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring beyond multi-byte string length",
+    ),
+    # Empty substring with start == end (empty range) returns start when start <= byte_length.
+    IndexOfBytesTest(
+        "empty_sub_start_eq_end_zero",
+        args=["hello", "", 0, 0],
+        expected=0,
+        msg="$indexOfBytes should return 0 for empty substring with start=end=0",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_eq_end_mid",
+        args=["hello", "", 3, 3],
+        expected=3,
+        msg="$indexOfBytes should return start for empty substring with start=end mid-string",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_eq_end_at_length",
+        args=["hello", "", 5, 5],
+        expected=5,
+        msg="$indexOfBytes should return byte length for empty substring with start=end at length",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_eq_end_beyond_length",
+        args=["hello", "", 6, 6],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring with start=end beyond length",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_empty_str_start_eq_end_zero",
+        args=["", "", 0, 0],
+        expected=0,
+        msg="$indexOfBytes should return 0 for empty substring in empty string with start=end=0",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_empty_str_start_eq_end_beyond",
+        args=["", "", 1, 1],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring in empty string with start=end=1",
+    ),
+    # Multi-byte: "café" is 5 bytes.
+    IndexOfBytesTest(
+        "empty_sub_multibyte_start_eq_end_at_length",
+        args=["café", "", 5, 5],
+        expected=5,
+        msg="$indexOfBytes should return byte length for empty substr at start=end multi-byte end",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_multibyte_start_eq_end_beyond",
+        args=["café", "", 6, 6],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substr with start=end beyond multi-byte end",
+    ),
+    # Empty substring where start > end returns -1.
+    IndexOfBytesTest(
+        "empty_sub_start_gt_end",
+        args=["hello", "", 4, 2],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring when start > end",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_start_gt_end_at_length",
+        args=["hello", "", 5, 3],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substring when start at length > end",
+    ),
+    IndexOfBytesTest(
+        "empty_sub_multibyte_start_gt_end",
+        args=["café", "", 5, 3],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for empty substr in multi-byte string when start > end",
+    ),
+]
+
+# Property [Start and End Range]: start and end constrain the byte range searched.
+INDEXOFBYTES_RANGE_TESTS: list[IndexOfBytesTest] = [
+    # "lo" starts at byte 3, but end=3 means only bytes 0-2 are searched.
+    IndexOfBytesTest(
+        "range_end_excludes_match",
+        args=["hello", "lo", 0, 3],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when end excludes the match position",
+    ),
+    IndexOfBytesTest(
+        "range_end_beyond_length",
+        args=["hello", "lo", 0, 100],
+        expected=3,
+        msg="$indexOfBytes should find match when end exceeds string length",
+    ),
+    IndexOfBytesTest(
+        "range_start_greater_than_end",
+        args=["hello", "lo", 4, 2],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when start > end",
+    ),
+    IndexOfBytesTest(
+        "range_start_equals_end",
+        args=["hello", "lo", 3, 3],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when start equals end",
+    ),
+    IndexOfBytesTest(
+        "range_start_at_byte_length",
+        args=["hello", "lo", 5],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when start is at byte length",
+    ),
+    IndexOfBytesTest(
+        "range_start_at_last_byte",
+        args=["hello", "o", 4],
+        expected=4,
+        msg="$indexOfBytes should find single char at last byte position",
+    ),
+    IndexOfBytesTest(
+        "range_start_beyond_byte_length",
+        args=["hello", "lo", 10],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when start is beyond byte length",
+    ),
+    # "llo" at index 2 spans bytes 2-4, but end=4 excludes byte 4.
+    IndexOfBytesTest(
+        "range_match_extends_beyond_end",
+        args=["hello", "llo", 0, 4],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when match extends beyond end",
+    ),
+    IndexOfBytesTest(
+        "range_match_fits_within_end",
+        args=["hello", "llo", 0, 5],
+        expected=2,
+        msg="$indexOfBytes should find match that fits exactly within end",
+    ),
+]
+
+# Property [Overlapping Matches]: when overlapping matches exist, the result is the byte index of
+# the first overlapping match.
+INDEXOFBYTES_OVERLAPPING_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "overlap_ascii",
+        args=["aaa", "aa"],
+        expected=0,
+        msg="$indexOfBytes should return first overlapping match position for ASCII",
+    ),
+    # "éé" is 4 bytes; first match at byte 0 in "éééé" (8 bytes).
+    IndexOfBytesTest(
+        "overlap_2byte",
+        args=["éééé", "éé"],
+        expected=0,
+        msg="$indexOfBytes should return first overlapping match for 2-byte chars",
+    ),
+    # "中中" is 6 bytes; first match at byte 0 in "中中中中" (12 bytes).
+    IndexOfBytesTest(
+        "overlap_3byte",
+        args=["中中中中", "中中"],
+        expected=0,
+        msg="$indexOfBytes should return first overlapping match for 3-byte chars",
+    ),
+]
+
+# Property [Edge Cases]: the operator handles boundary inputs correctly (empty string, match at
+# the end, whole-string and over-long substrings, number-like strings).
+INDEXOFBYTES_EDGE_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "edge_empty_string",
+        args=["", "hello"],
+        expected=-1,
+        msg="$indexOfBytes should return -1 searching non-empty substring in empty string",
+    ),
+    IndexOfBytesTest(
+        "edge_at_end",
+        args=["hello", "o"],
+        expected=4,
+        msg="$indexOfBytes should find substring at end of string",
+    ),
+    IndexOfBytesTest(
+        "edge_equals_entire",
+        args=["hello", "hello"],
+        expected=0,
+        msg="$indexOfBytes should return 0 when substring equals entire string",
+    ),
+    IndexOfBytesTest(
+        "edge_longer_than_string",
+        args=["hi", "hello"],
+        expected=-1,
+        msg="$indexOfBytes should return -1 when substring is longer than string",
+    ),
+    # Number-like strings are treated as strings, not coerced to numeric types.
+    IndexOfBytesTest(
+        "edge_number_like_zero",
+        args=["a0b3", "0"],
+        expected=1,
+        msg="$indexOfBytes should find '0' as a string, not coerce to number",
+    ),
+    IndexOfBytesTest(
+        "edge_number_like_digit",
+        args=["a0b3", "3"],
+        expected=3,
+        msg="$indexOfBytes should find '3' as a string, not coerce to number",
+    ),
+]
+
+
+INDEXOFBYTES_SEARCH_TESTS = (
+    INDEXOFBYTES_FIRST_OCCURRENCE_TESTS
+    + INDEXOFBYTES_EMPTY_SUBSTRING_TESTS
+    + INDEXOFBYTES_RANGE_TESTS
+    + INDEXOFBYTES_OVERLAPPING_TESTS
+    + INDEXOFBYTES_EDGE_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_SEARCH_TESTS))
+def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest):
+    """Run one $indexOfBytes search case and check its expected result or error code."""
+    result = execute_expression(collection, {"$indexOfBytes": test_case.args})
+    assertResult(
+        result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg
+    )
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py
new file mode 100644
index 00000000..2de31dde
--- /dev/null
+++
b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import (
+    IndexOfBytesTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Property [String Size Limit Success]: a string or substring argument one byte under the limit
+# is accepted.
+INDEXOFBYTES_SIZE_LIMIT_SUCCESS_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "size_string_one_under",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 1), "a"],
+        expected=0,
+        msg="$indexOfBytes should accept string one byte under the size limit",
+    ),
+    IndexOfBytesTest(
+        "size_substring_one_under",
+        args=["hello", "a" * (STRING_SIZE_LIMIT_BYTES - 1)],
+        expected=-1,
+        msg="$indexOfBytes should accept substring one byte under the size limit",
+    ),
+    # 2-byte chars + 1 ASCII byte = limit - 1 bytes (assumes an even limit); limit is byte-based.
+    IndexOfBytesTest(
+        "size_string_one_under_2byte",
+        args=["é" * ((STRING_SIZE_LIMIT_BYTES - 1) // 2) + "a", "a"],
+        expected=STRING_SIZE_LIMIT_BYTES - 2,
+        msg="$indexOfBytes should accept 2-byte char string one byte under limit",
+    ),
+    # Found at end of a large string.
+    IndexOfBytesTest(
+        "size_found_at_end",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 2) + "b", "b"],
+        expected=STRING_SIZE_LIMIT_BYTES - 2,
+        msg="$indexOfBytes should find match at end of a large string",
+    ),
+    # Not found in a large string.
+    IndexOfBytesTest(
+        "size_not_found",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 1), "b"],
+        expected=-1,
+        msg="$indexOfBytes should return -1 for no match in a large string",
+    ),
+]
+
+# Property [String Size Limit Error]: a string or substring argument at or above the size limit
+# produces an error.
+INDEXOFBYTES_SIZE_LIMIT_ERROR_TESTS: list[IndexOfBytesTest] = [
+    IndexOfBytesTest(
+        "size_string_at_limit",
+        args=["a" * STRING_SIZE_LIMIT_BYTES, "a"],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfBytes should reject string at the size limit",
+    ),
+    IndexOfBytesTest(
+        "size_substring_at_limit",
+        args=["hello", "a" * STRING_SIZE_LIMIT_BYTES],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfBytes should reject substring at the size limit",
+    ),
+    # 2-byte chars: exactly at the limit if the limit is even; byte-based, not codepoint-based.
+    IndexOfBytesTest(
+        "size_string_at_limit_2byte",
+        args=["é" * (STRING_SIZE_LIMIT_BYTES // 2), "a"],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfBytes should reject 2-byte char string at the byte size limit",
+    ),
+]
+
+
+INDEXOFBYTES_SIZE_LIMIT_TESTS = (
+    INDEXOFBYTES_SIZE_LIMIT_SUCCESS_TESTS + INDEXOFBYTES_SIZE_LIMIT_ERROR_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_SIZE_LIMIT_TESTS))
+def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest):
+    """Run one $indexOfBytes size-limit case and check its expected result or error code."""
+    result = execute_expression(collection, {"$indexOfBytes": test_case.args})
+    assertResult(
+        result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg
+    )
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py
new file mode 100644
index 00000000..76b17dfc
--- /dev/null
+++
b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py @@ -0,0 +1,471 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp +from bson.code import Code + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + INDEXOF_INDEX_TYPE_ERROR, + INDEXOFBYTES_STRING_TYPE_ERROR, + INDEXOFBYTES_SUBSTRING_TYPE_ERROR, +) +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import DECIMAL128_ONE_AND_HALF, MISSING +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + IndexOfBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Type Strictness]: arguments of incorrect type produce an error. 
+INDEXOFBYTES_TYPE_ERROR_TESTS: list[IndexOfBytesTest] = [ + # First arg: non-string, non-null types + IndexOfBytesTest( + "type_first_array", + args=[["a"], "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject array as string argument", + ), + IndexOfBytesTest( + "type_first_binary", + args=[Binary(b"data"), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject binary as string argument", + ), + IndexOfBytesTest( + "type_first_bool", + args=[True, "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject bool as string argument", + ), + IndexOfBytesTest( + "type_first_date", + args=[datetime(2024, 1, 1, tzinfo=timezone.utc), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject date as string argument", + ), + IndexOfBytesTest( + "type_first_decimal128", + args=[DECIMAL128_ONE_AND_HALF, "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject decimal128 as string argument", + ), + IndexOfBytesTest( + "type_first_float", + args=[3.14, "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject float as string argument", + ), + IndexOfBytesTest( + "type_first_int", + args=[42, "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject int as string argument", + ), + IndexOfBytesTest( + "type_first_long", + args=[Int64(42), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject long as string argument", + ), + IndexOfBytesTest( + "type_first_maxkey", + args=[MaxKey(), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject maxkey as string argument", + ), + IndexOfBytesTest( + "type_first_minkey", + args=[MinKey(), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject minkey as string argument", + ), + IndexOfBytesTest( + "type_first_object", + 
args=[{"a": 1}, "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject object as string argument", + ), + IndexOfBytesTest( + "type_first_objectid", + args=[ObjectId("507f1f77bcf86cd799439011"), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject objectid as string argument", + ), + IndexOfBytesTest( + "type_first_regex", + args=[Regex("pattern"), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject regex as string argument", + ), + IndexOfBytesTest( + "type_first_timestamp", + args=[Timestamp(1, 1), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject timestamp as string argument", + ), + IndexOfBytesTest( + "type_first_code", + args=[Code("function() {}"), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject javascript code as string argument", + ), + IndexOfBytesTest( + "type_first_code_scope", + args=[Code("function() {}", {"x": 1}), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject code with scope as string argument", + ), + IndexOfBytesTest( + "type_first_binary_uuid", + args=[Binary(b"data", 4), "sub"], + error_code=INDEXOFBYTES_STRING_TYPE_ERROR, + msg="$indexOfBytes should reject binary UUID as string argument", + ), + # Second arg: non-string types + IndexOfBytesTest( + "type_second_array", + args=["hello", ["a"]], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject array as substring argument", + ), + IndexOfBytesTest( + "type_second_binary", + args=["hello", Binary(b"data")], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject binary as substring argument", + ), + IndexOfBytesTest( + "type_second_bool", + args=["hello", True], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject bool as substring argument", + ), + IndexOfBytesTest( + "type_second_date", + 
args=["hello", datetime(2024, 1, 1, tzinfo=timezone.utc)], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject date as substring argument", + ), + IndexOfBytesTest( + "type_second_decimal128", + args=["hello", DECIMAL128_ONE_AND_HALF], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject decimal128 as substring argument", + ), + IndexOfBytesTest( + "type_second_float", + args=["hello", 3.14], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject float as substring argument", + ), + IndexOfBytesTest( + "type_second_int", + args=["hello", 42], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject int as substring argument", + ), + IndexOfBytesTest( + "type_second_long", + args=["hello", Int64(42)], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject long as substring argument", + ), + IndexOfBytesTest( + "type_second_maxkey", + args=["hello", MaxKey()], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject maxkey as substring argument", + ), + IndexOfBytesTest( + "type_second_minkey", + args=["hello", MinKey()], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject minkey as substring argument", + ), + IndexOfBytesTest( + "type_second_object", + args=["hello", {"a": 1}], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject object as substring argument", + ), + IndexOfBytesTest( + "type_second_objectid", + args=["hello", ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject objectid as substring argument", + ), + IndexOfBytesTest( + "type_second_regex", + args=["hello", Regex("pattern")], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject regex as substring argument", + ), + IndexOfBytesTest( + "type_second_timestamp", + args=["hello", 
Timestamp(1, 1)], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject timestamp as substring argument", + ), + IndexOfBytesTest( + "type_second_code", + args=["hello", Code("function() {}")], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject javascript code as substring argument", + ), + IndexOfBytesTest( + "type_second_code_scope", + args=["hello", Code("function() {}", {"x": 1})], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject code with scope as substring argument", + ), + IndexOfBytesTest( + "type_second_binary_uuid", + args=["hello", Binary(b"data", 4)], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject binary UUID as substring argument", + ), + IndexOfBytesTest( + "type_second_null", + args=["hello", None], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject null as substring argument", + ), + IndexOfBytesTest( + "type_second_missing", + args=["hello", MISSING], + error_code=INDEXOFBYTES_SUBSTRING_TYPE_ERROR, + msg="$indexOfBytes should reject missing field as substring argument", + ), + # Third arg (start): non-numeric types + IndexOfBytesTest( + "type_third_array", + args=["hello", "h", ["a"]], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject array as start argument", + ), + IndexOfBytesTest( + "type_third_binary", + args=["hello", "h", Binary(b"data")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject binary as start argument", + ), + IndexOfBytesTest( + "type_third_bool", + args=["hello", "h", True], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject bool as start argument", + ), + IndexOfBytesTest( + "type_third_date", + args=["hello", "h", datetime(2024, 1, 1, tzinfo=timezone.utc)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject date as start argument", + ), + IndexOfBytesTest( + 
"type_third_maxkey", + args=["hello", "h", MaxKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject maxkey as start argument", + ), + IndexOfBytesTest( + "type_third_minkey", + args=["hello", "h", MinKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject minkey as start argument", + ), + IndexOfBytesTest( + "type_third_object", + args=["hello", "h", {"a": 1}], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject object as start argument", + ), + IndexOfBytesTest( + "type_third_objectid", + args=["hello", "h", ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject objectid as start argument", + ), + IndexOfBytesTest( + "type_third_regex", + args=["hello", "h", Regex("pattern")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject regex as start argument", + ), + IndexOfBytesTest( + "type_third_timestamp", + args=["hello", "h", Timestamp(1, 1)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject timestamp as start argument", + ), + IndexOfBytesTest( + "type_third_code", + args=["hello", "h", Code("function() {}")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject javascript code as start argument", + ), + IndexOfBytesTest( + "type_third_code_scope", + args=["hello", "h", Code("function() {}", {"x": 1})], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject code with scope as start argument", + ), + IndexOfBytesTest( + "type_third_binary_uuid", + args=["hello", "h", Binary(b"data", 4)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject binary UUID as start argument", + ), + IndexOfBytesTest( + "type_third_string", + args=["hello", "h", "x"], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject string as start argument", + ), + IndexOfBytesTest( + "type_third_null", + args=["hello", "h", None], + 
error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject null as start argument", + ), + IndexOfBytesTest( + "type_third_missing", + args=["hello", "h", MISSING], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject missing field as start argument", + ), + # Fourth arg (end): non-numeric types + IndexOfBytesTest( + "type_fourth_array", + args=["hello", "h", 0, ["a"]], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject array as end argument", + ), + IndexOfBytesTest( + "type_fourth_binary", + args=["hello", "h", 0, Binary(b"data")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject binary as end argument", + ), + IndexOfBytesTest( + "type_fourth_bool", + args=["hello", "h", 0, True], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject bool as end argument", + ), + IndexOfBytesTest( + "type_fourth_date", + args=["hello", "h", 0, datetime(2024, 1, 1, tzinfo=timezone.utc)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject date as end argument", + ), + IndexOfBytesTest( + "type_fourth_maxkey", + args=["hello", "h", 0, MaxKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject maxkey as end argument", + ), + IndexOfBytesTest( + "type_fourth_minkey", + args=["hello", "h", 0, MinKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject minkey as end argument", + ), + IndexOfBytesTest( + "type_fourth_object", + args=["hello", "h", 0, {"a": 1}], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject object as end argument", + ), + IndexOfBytesTest( + "type_fourth_objectid", + args=["hello", "h", 0, ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject objectid as end argument", + ), + IndexOfBytesTest( + "type_fourth_regex", + args=["hello", "h", 0, Regex("pattern")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + 
msg="$indexOfBytes should reject regex as end argument", + ), + IndexOfBytesTest( + "type_fourth_timestamp", + args=["hello", "h", 0, Timestamp(1, 1)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject timestamp as end argument", + ), + IndexOfBytesTest( + "type_fourth_code", + args=["hello", "h", 0, Code("function() {}")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject javascript code as end argument", + ), + IndexOfBytesTest( + "type_fourth_code_scope", + args=["hello", "h", 0, Code("function() {}", {"x": 1})], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject code with scope as end argument", + ), + IndexOfBytesTest( + "type_fourth_binary_uuid", + args=["hello", "h", 0, Binary(b"data", 4)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject binary UUID as end argument", + ), + IndexOfBytesTest( + "type_fourth_string", + args=["hello", "h", 0, "x"], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject string as end argument", + ), + IndexOfBytesTest( + "type_fourth_null", + args=["hello", "h", 0, None], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject null as end argument", + ), + IndexOfBytesTest( + "type_fourth_missing", + args=["hello", "h", 0, MISSING], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject missing field as end argument", + ), + # Non-integral floats and decimals are rejected as index arguments even though + # whole-number values of these types are accepted (see INDEXOFBYTES_INDEX_TYPE_TESTS). 
+ IndexOfBytesTest( + "type_third_non_integral_float", + args=["hello", "h", 3.14], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject non-integral float as start", + ), + IndexOfBytesTest( + "type_third_non_integral_decimal128", + args=["hello", "h", DECIMAL128_ONE_AND_HALF], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject non-integral Decimal128 as start", + ), + IndexOfBytesTest( + "type_fourth_non_integral_float", + args=["hello", "h", 0, 3.14], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject non-integral float as end", + ), + IndexOfBytesTest( + "type_fourth_non_integral_decimal128", + args=["hello", "h", 0, DECIMAL128_ONE_AND_HALF], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfBytes should reject non-integral Decimal128 as end", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_TYPE_ERROR_TESTS)) +def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest): + """Test $indexOfBytes argument type error cases.""" + result = execute_expression(collection, {"$indexOfBytes": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py new file mode 100644 index 00000000..401226d1 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES +from
documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + IndexOfBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_project_with_insert, +) + +# Property [Expression Arguments]: all argument positions accept expressions that resolve to the +# expected type. +INDEXOFBYTES_EXPR_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "expr_first_arg", + args=[{"$concat": ["hel", "lo"]}, "lo"], + expected=3, + msg="$indexOfBytes should accept expression for string argument", + ), + IndexOfBytesTest( + "expr_second_arg", + args=["hello", {"$concat": ["l", "o"]}], + expected=3, + msg="$indexOfBytes should accept expression for substring argument", + ), + IndexOfBytesTest( + "expr_third_arg", + args=["hello", "lo", {"$add": [1, 2]}], + expected=3, + msg="$indexOfBytes should accept expression for start argument", + ), + IndexOfBytesTest( + "expr_fourth_arg", + args=["hello", "lo", 0, {"$add": [3, 2]}], + expected=3, + msg="$indexOfBytes should accept expression for end argument", + ), +] + +# Property [Dollar Sign Literal]: using $literal to pass a "$" substring avoids field path +# interpretation and finds the dollar sign correctly. 
+INDEXOFBYTES_DOLLAR_SIGN_SUCCESS_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "dollar_literal_found", + args=["hello$world", {"$literal": "$"}], + expected=5, + msg="$indexOfBytes should find dollar sign via $literal", + ), +] + + +INDEXOFBYTES_USAGE_TESTS = INDEXOFBYTES_EXPR_TESTS + INDEXOFBYTES_DOLLAR_SIGN_SUCCESS_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_USAGE_TESTS)) +def test_indexofbytes_cases(collection, test_case: IndexOfBytesTest): + """Test $indexOfBytes expression arguments and $literal dollar-sign lookup.""" + result = execute_expression(collection, {"$indexOfBytes": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) + + +# Property [Document Field References]: $indexOfBytes works with field references +# from inserted documents, not just inline literals. +def test_indexofbytes_document_fields(collection): + """Test $indexOfBytes reads values from document fields.""" + result = execute_project_with_insert( + collection, + {"s": "hello", "sub": "lo"}, + {"result": {"$indexOfBytes": ["$s", "$sub"]}}, + ) + assertSuccess( + result, + [{"result": 3}], + msg="$indexOfBytes should find substring from document field references", + ) + + +# Property [Nested Field Paths]: $indexOfBytes resolves dotted field paths in nested documents. +def test_indexofbytes_nested_field_paths(collection): + """Test $indexOfBytes reads values from nested document field paths.""" + result = execute_project_with_insert( + collection, + {"a": {"b": "hello"}, "c": {"d": "lo"}}, + {"result": {"$indexOfBytes": ["$a.b", "$c.d"]}}, + ) + assertSuccess( + result, + [{"result": 3}], + msg="$indexOfBytes should find substring from nested field paths", + ) + + +# Property [Return Type]: the result is int (not float or long) when the expression succeeds.
+INDEXOFBYTES_RETURN_TYPE_TESTS: list[IndexOfBytesTest] = [ + IndexOfBytesTest( + "return_type_two_args", + args=["hello", "ell"], + msg="$indexOfBytes should return int type with two args", + ), + IndexOfBytesTest( + "return_type_three_args", + args=["hello", "lo", 2], + msg="$indexOfBytes should return int type with three args", + ), + IndexOfBytesTest( + "return_type_four_args", + args=["hello", "ll", 0, 5], + msg="$indexOfBytes should return int type with four args", + ), + IndexOfBytesTest( + "return_type_not_found", + args=["hello", "xyz"], + msg="$indexOfBytes should return int type when not found", + ), + IndexOfBytesTest( + "return_type_large_index", + args=["a" * (STRING_SIZE_LIMIT_BYTES - 2) + "b", "b"], + msg="$indexOfBytes should return int type for large index value", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFBYTES_RETURN_TYPE_TESTS)) +def test_indexofbytes_return_type(collection, test_case: IndexOfBytesTest): + """Test $indexOfBytes result is always type int.""" + result = execute_expression(collection, {"$type": {"$indexOfBytes": test_case.args}}) + assertSuccess(result, [{"result": "int"}], msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/indexOfBytes_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/indexOfBytes_common.py new file mode 100644 index 00000000..8e1d6d0c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/utils/indexOfBytes_common.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from 
documentdb_tests.framework.test_case import BaseTestCase + + +@dataclass(frozen=True) +class IndexOfBytesTest(BaseTestCase): + """Test case for $indexOfBytes operator.""" + + # Uses args because start and end are optional positional parameters. + # Named fields would be ambiguous about whether an unset optional + # should be omitted from the array or passed as None. + args: list[Any] = None # type: ignore[assignment] diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py new file mode 100644 index 00000000..0d2ccf67 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py @@ -0,0 +1,294 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + IndexOfCPTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Code Point Indexing]: the result is a UTF-8 code point index, not a byte index. +INDEXOFCP_CODEPOINT_TESTS: list[IndexOfCPTest] = [ + # Character after 2-byte é (U+00E9): code point index 1 (byte index would be 2). + IndexOfCPTest( + "cp_after_2byte", + args=["éa", "a"], + expected=1, + msg="$indexOfCP should return cp index 1 after 2-byte character", + ), + # Character after 3-byte 日 (U+65E5): code point index 1 (byte index would be 3). + IndexOfCPTest( + "cp_after_3byte", + args=["日a", "a"], + expected=1, + msg="$indexOfCP should return cp index 1 after 3-byte character", + ), + # Character after 4-byte 🎉 (U+1F389): code point index 1 (byte index would be 4). 
+ IndexOfCPTest( + "cp_after_4byte", + args=["🎉a", "a"], + expected=1, + msg="$indexOfCP should return cp index 1 after 4-byte character", + ), + # Finding a multi-byte substring returns its code point index, not byte index. + IndexOfCPTest( + "cp_find_multibyte_substr", + args=["café", "é"], + expected=3, + msg="$indexOfCP should find multi-byte substring at code point index", + ), + # Decomposed é ("e" U+0065 + combining acute U+0301) found in a decomposed string. + IndexOfCPTest( + "cp_find_decomposed_in_decomposed", + args=["cafe\u0301", "e\u0301"], + expected=3, + msg="$indexOfCP should find decomposed sequence in decomposed string", + ), + # ASCII "e" does not match precomposed é (U+00E9). + IndexOfCPTest( + "cp_ascii_vs_precomposed", + args=["café", "e"], + expected=-1, + msg="$indexOfCP should not match ASCII 'e' against precomposed é", + ), + # Decomposed é is "e" (U+0065) + combining acute (U+0301). "e" matches at code point 3. + IndexOfCPTest( + "cp_ascii_in_decomposed", + args=["cafe\u0301", "e"], + expected=3, + msg="$indexOfCP should find ASCII 'e' in decomposed é sequence", + ), + # Precomposed é (U+00E9) and decomposed e + combining acute (U+0301) are distinct code point + # sequences. + IndexOfCPTest( + "cp_precomposed_in_decomposed", + args=["cafe\u0301", "\u00e9"], + expected=-1, + msg="$indexOfCP should not find precomposed é in decomposed string", + ), + IndexOfCPTest( + "cp_decomposed_in_precomposed", + args=["caf\u00e9", "e\u0301"], + expected=-1, + msg="$indexOfCP should not find decomposed é in precomposed string", + ), + # Start and end parameters are code point offsets, not byte offsets. 
+ IndexOfCPTest( + "cp_multibyte_start", + args=["寿司", "司", 1], + expected=1, + msg="$indexOfCP should find 3-byte char at cp offset with start", + ), + IndexOfCPTest( + "cp_multibyte_start_end_excludes", + args=["寿司", "司", 1, 1], + expected=-1, + msg="$indexOfCP should return -1 when end excludes match in multi-byte string", + ), + IndexOfCPTest( + "cp_multibyte_start_end_includes", + args=["寿司", "司", 1, 2], + expected=1, + msg="$indexOfCP should find match when end includes it in multi-byte string", + ), + IndexOfCPTest( + "cp_multibyte_start_past_match", + args=["寿司", "司", 2], + expected=-1, + msg="$indexOfCP should return -1 when start is past match in multi-byte string", + ), + # Null bytes are single code points and do not terminate the string. + IndexOfCPTest( + "cp_null_byte_found", + args=["a\x00b", "\x00"], + expected=1, + msg="$indexOfCP should find embedded null byte", + ), + IndexOfCPTest( + "cp_after_null_byte", + args=["a\x00b", "b"], + expected=2, + msg="$indexOfCP should find character after embedded null byte", + ), + IndexOfCPTest( + "cp_null_byte_with_start", + args=["a\x00a\x00", "\x00", 2], + expected=3, + msg="$indexOfCP should find second null byte when start skips first", + ), + IndexOfCPTest( + "cp_null_byte_with_start_end", + args=["\x00a\x00ab", "\x00", 2, 3], + expected=2, + msg="$indexOfCP should find null byte within start/end range", + ), +] + + +# Property [Case Sensitivity]: search is strictly case-sensitive with no Unicode case folding or +# locale-dependent mapping. +INDEXOFCP_CASE_SENSITIVITY_TESTS: list[IndexOfCPTest] = [ + # ASCII case differences are not matched. 
+ IndexOfCPTest( + "case_ascii_upper_in_lower", + args=["hello", "E"], + expected=-1, + msg="$indexOfCP should not find uppercase in lowercase ASCII", + ), + IndexOfCPTest( + "case_ascii_lower_in_upper", + args=["HELLO", "e"], + expected=-1, + msg="$indexOfCP should not find lowercase in uppercase ASCII", + ), + # Greek sigma: uppercase Σ (U+03A3) vs lowercase σ (U+03C3). + IndexOfCPTest( + "case_greek_sigma_upper_in_lower", + args=["σ", "Σ"], + expected=-1, + msg="$indexOfCP should not case-fold Greek sigma", + ), + IndexOfCPTest( + "case_greek_sigma_lower_in_upper", + args=["Σ", "σ"], + expected=-1, + msg="$indexOfCP should not case-fold Greek uppercase sigma", + ), + # Cyrillic: uppercase Д (U+0414) vs lowercase д (U+0434). + IndexOfCPTest( + "case_cyrillic_upper_in_lower", + args=["д", "Д"], + expected=-1, + msg="$indexOfCP should not case-fold Cyrillic de", + ), + IndexOfCPTest( + "case_cyrillic_lower_in_upper", + args=["Д", "д"], + expected=-1, + msg="$indexOfCP should not case-fold Cyrillic uppercase de", + ), + # No sharp-s expansion: ß (U+00DF) does not match "SS" or "ss". + IndexOfCPTest( + "case_sharp_s_vs_upper_ss", + args=["ß", "SS"], + expected=-1, + msg="$indexOfCP should not expand sharp-s to SS", + ), + IndexOfCPTest( + "case_sharp_s_vs_lower_ss", + args=["ß", "ss"], + expected=-1, + msg="$indexOfCP should not expand sharp-s to ss", + ), + # No ligature expansion: U+FB01 (fi ligature) does not match ASCII "fi". + IndexOfCPTest( + "case_ligature_fi", + args=["\ufb01", "fi"], + expected=-1, + msg="$indexOfCP should not expand fi ligature", + ), + # No locale-dependent mapping: ı (U+0131, Turkish dotless i) does not match "i".
+ IndexOfCPTest( + "case_turkish_dotless_i", + args=["ı", "i"], + expected=-1, + msg="$indexOfCP should not map dotless i to ASCII i", + ), +] + + +# Property [Encoding and Character Handling]: special Unicode characters including control +# characters, non-ASCII whitespace, and zero-width characters are individually findable at their +# code point offsets. +INDEXOFCP_ENCODING_TESTS: list[IndexOfCPTest] = [ + # Control characters are single code points. + IndexOfCPTest( + "enc_control_soh", + args=["a\x01b", "\x01"], + expected=1, + msg="$indexOfCP should find SOH control character", + ), + IndexOfCPTest( + "enc_control_us", + args=["a\x1fb", "\x1f"], + expected=1, + msg="$indexOfCP should find US control character", + ), + IndexOfCPTest( + "enc_newline", + args=["line1\nline2", "\n"], + expected=5, + msg="$indexOfCP should find newline character", + ), + IndexOfCPTest( + "enc_tab", args=["col1\tcol2", "\t"], expected=4, msg="$indexOfCP should find tab character" + ), + IndexOfCPTest( + "enc_carriage_return", + args=["line1\rline2", "\r"], + expected=5, + msg="$indexOfCP should find carriage return character", + ), + # Non-breaking space (U+00A0) is distinct from ASCII space. + IndexOfCPTest( + "enc_nbsp_found", + args=["a\u00a0b", "\u00a0"], + expected=1, + msg="$indexOfCP should find non-breaking space", + ), + IndexOfCPTest( + "enc_nbsp_not_ascii_space", + args=["a\u00a0b", " "], + expected=-1, + msg="$indexOfCP should not match ASCII space against NBSP", + ), + # En space (U+2000) is distinct from ASCII space. + IndexOfCPTest( + "enc_en_space_found", + args=["a\u2000b", "\u2000"], + expected=1, + msg="$indexOfCP should find en space", + ), + IndexOfCPTest( + "enc_en_space_not_ascii_space", + args=["a\u2000b", " "], + expected=-1, + msg="$indexOfCP should not match ASCII space against en space", + ), + # BOM (U+FEFF). 
+ IndexOfCPTest( + "enc_bom_found", + args=["a\ufeffb", "\ufeff"], + expected=1, + msg="$indexOfCP should find BOM character", + ), + # ZWSP (U+200B). + IndexOfCPTest( + "enc_zwsp_found", + args=["a\u200bb", "\u200b"], + expected=1, + msg="$indexOfCP should find zero-width space", + ), + # Combining mark alone (U+0301) is findable within a combining sequence. + IndexOfCPTest( + "enc_combining_mark_alone", + args=["cafe\u0301", "\u0301"], + expected=4, + msg="$indexOfCP should find combining mark alone at code point offset", + ), +] + +INDEXOFCP_ENCODING_AND_CASE_TESTS = ( + INDEXOFCP_CODEPOINT_TESTS + INDEXOFCP_CASE_SENSITIVITY_TESTS + INDEXOFCP_ENCODING_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_ENCODING_AND_CASE_TESTS)) +def test_indexofcp_encoding(collection, test_case: IndexOfCPTest): + """Test $indexOfCP encoding and case sensitivity.""" + result = execute_expression(collection, {"$indexOfCP": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py new file mode 100644 index 00000000..4a023539 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py @@ -0,0 +1,317 @@ +from __future__ import annotations + +import pytest +from bson import Decimal128, Int64 + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import INDEXOF_INDEX_TYPE_ERROR, INDEXOF_NEGATIVE_INDEX_ERROR +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_NAN, + DECIMAL128_NEGATIVE_INFINITY, + DECIMAL128_NEGATIVE_ZERO, + 
DECIMAL128_ZERO, + DOUBLE_NEGATIVE_ZERO, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT32_OVERFLOW, + INT32_UNDERFLOW, + INT64_ZERO, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + IndexOfCPTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Index Type Acceptance]: integral Decimal128, Int64, and whole-number floats are accepted +# for start and end. +INDEXOFCP_INDEX_TYPE_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "idx_type_float_start", + args=["hello", "lo", 3.0], + expected=3, + msg="$indexOfCP should accept whole-number float as start", + ), + IndexOfCPTest( + "idx_type_float_start_and_end", + args=["hello", "lo", DOUBLE_ZERO, 5.0], + expected=3, + msg="$indexOfCP should accept whole-number floats as start and end", + ), + IndexOfCPTest( + "idx_type_decimal128_start", + args=["hello", "lo", Decimal128("3")], + expected=3, + msg="$indexOfCP should accept Decimal128 as start", + ), + IndexOfCPTest( + "idx_type_decimal128_start_and_end", + args=["hello", "lo", DECIMAL128_ZERO, Decimal128("5")], + expected=3, + msg="$indexOfCP should accept Decimal128 as start and end", + ), + IndexOfCPTest( + "idx_type_int64_start", + args=["hello", "lo", Int64(3)], + expected=3, + msg="$indexOfCP should accept Int64 as start", + ), + IndexOfCPTest( + "idx_type_int64_start_and_end", + args=["hello", "lo", INT64_ZERO, Int64(5)], + expected=3, + msg="$indexOfCP should accept Int64 as start and end", + ), +] + + +# Property [Numeric Edge Cases - Success]: negative zero, Decimal128 with trailing zeros, and +# Decimal128 exponent notation are accepted as start and end values. 
+INDEXOFCP_NUMERIC_EDGE_SUCCESS_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "num_edge_neg_zero_float_start", + args=["hello", "h", DOUBLE_NEGATIVE_ZERO], + expected=0, + msg="$indexOfCP should treat -0.0 float start as 0", + ), + IndexOfCPTest( + "num_edge_neg_zero_decimal_start", + args=["hello", "h", DECIMAL128_NEGATIVE_ZERO], + expected=0, + msg="$indexOfCP should treat Decimal128 negative zero start as 0", + ), + # end=0 means the search range is empty, so no match. + IndexOfCPTest( + "num_edge_neg_zero_float_end", + args=["hello", "h", 0, DOUBLE_NEGATIVE_ZERO], + expected=-1, + msg="$indexOfCP should treat -0.0 float end as 0 yielding empty range", + ), + IndexOfCPTest( + "num_edge_neg_zero_decimal_end", + args=["hello", "h", 0, DECIMAL128_NEGATIVE_ZERO], + expected=-1, + msg="$indexOfCP should treat Decimal128 negative zero end as 0 yielding empty range", + ), + IndexOfCPTest( + "num_edge_decimal_trailing_zeros_start", + args=["hello", "lo", Decimal128("3.0")], + expected=3, + msg="$indexOfCP should accept Decimal128 with trailing zeros as start", + ), + IndexOfCPTest( + "num_edge_decimal_trailing_zeros_end", + args=["hello", "lo", 0, Decimal128("5.0")], + expected=3, + msg="$indexOfCP should accept Decimal128 with trailing zeros as end", + ), + IndexOfCPTest( + "num_edge_decimal_exponent_start", + args=["hello", "lo", Decimal128("3E0")], + expected=3, + msg="$indexOfCP should accept Decimal128 with exponent notation as start", + ), + IndexOfCPTest( + "num_edge_decimal_exponent_end", + args=["hello", "lo", 0, Decimal128("5E0")], + expected=3, + msg="$indexOfCP should accept Decimal128 with exponent notation as end", + ), +] + +INDEXOFCP_INDEX_TYPE_AND_NUMERIC_EDGE_TESTS = ( + INDEXOFCP_INDEX_TYPE_TESTS + INDEXOFCP_NUMERIC_EDGE_SUCCESS_TESTS +) + + +# Property [Negative Index]: negative start or end values produce an error. 
+INDEXOFCP_NEGATIVE_INDEX_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "neg_start", + args=["hello", "h", -1], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative int start", + ), + IndexOfCPTest( + "neg_end", + args=["hello", "h", 0, -1], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative int end", + ), + IndexOfCPTest( + "neg_both", + args=["hello", "h", -1, -2], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative start and end", + ), + # Negative floats like -1.0 also produce an error. + IndexOfCPTest( + "neg_float_start", + args=["hello", "h", -1.0], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative float start", + ), + IndexOfCPTest( + "neg_float_end", + args=["hello", "h", 0, -1.0], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative float end", + ), + IndexOfCPTest( + "neg_int64_start", + args=["hello", "h", Int64(-1)], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative Int64 start", + ), + IndexOfCPTest( + "neg_int64_end", + args=["hello", "h", 0, Int64(-1)], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative Int64 end", + ), + IndexOfCPTest( + "neg_decimal_start", + args=["hello", "h", Decimal128("-1")], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative Decimal128 start", + ), + IndexOfCPTest( + "neg_decimal_end", + args=["hello", "h", 0, Decimal128("-1")], + error_code=INDEXOF_NEGATIVE_INDEX_ERROR, + msg="$indexOfCP should reject negative Decimal128 end", + ), +] + + +# Property [Numeric Edge Cases - Error]: NaN, infinity, and out-of-int32-range values are rejected +# as start and end arguments. 
+INDEXOFCP_NUMERIC_EDGE_ERROR_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "num_edge_nan_start", + args=["hello", "h", FLOAT_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject NaN float as start", + ), + IndexOfCPTest( + "num_edge_nan_end", + args=["hello", "h", 0, FLOAT_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject NaN float as end", + ), + IndexOfCPTest( + "num_edge_decimal_nan_start", + args=["hello", "h", DECIMAL128_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 NaN as start", + ), + IndexOfCPTest( + "num_edge_decimal_nan_end", + args=["hello", "h", 0, DECIMAL128_NAN], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 NaN as end", + ), + IndexOfCPTest( + "num_edge_inf_start", + args=["hello", "h", FLOAT_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject positive infinity as start", + ), + IndexOfCPTest( + "num_edge_inf_end", + args=["hello", "h", 0, FLOAT_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject positive infinity as end", + ), + IndexOfCPTest( + "num_edge_neg_inf_start", + args=["hello", "h", FLOAT_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject negative infinity as start", + ), + IndexOfCPTest( + "num_edge_neg_inf_end", + args=["hello", "h", 0, FLOAT_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject negative infinity as end", + ), + IndexOfCPTest( + "num_edge_decimal_inf_start", + args=["hello", "h", DECIMAL128_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 infinity as start", + ), + IndexOfCPTest( + "num_edge_decimal_inf_end", + args=["hello", "h", 0, DECIMAL128_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 infinity as end", + ), + IndexOfCPTest( + "num_edge_decimal_neg_inf_start", 
+ args=["hello", "h", DECIMAL128_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 negative infinity as start", + ), + IndexOfCPTest( + "num_edge_decimal_neg_inf_end", + args=["hello", "h", 0, DECIMAL128_NEGATIVE_INFINITY], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 negative infinity as end", + ), + IndexOfCPTest( + "num_edge_int64_over_max_start", + args=["hello", "h", Int64(INT32_OVERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Int64 above int32 max as start", + ), + IndexOfCPTest( + "num_edge_int64_over_max_end", + args=["hello", "h", 0, Int64(INT32_OVERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Int64 above int32 max as end", + ), + IndexOfCPTest( + "num_edge_decimal_over_max_start", + args=["hello", "h", Decimal128("2147483648")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 above int32 max as start", + ), + IndexOfCPTest( + "num_edge_decimal_over_max_end", + args=["hello", "h", 0, Decimal128("2147483648")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject Decimal128 above int32 max as end", + ), + # The int32 range check takes precedence over the negative check. 
+ IndexOfCPTest( + "num_edge_int64_under_min_start", + args=["hello", "h", Int64(INT32_UNDERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP int32 range error should precede negative error for start", + ), + IndexOfCPTest( + "num_edge_int64_under_min_end", + args=["hello", "h", 0, Int64(INT32_UNDERFLOW)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP int32 range error should precede negative error for end", + ), +] + +INDEXOFCP_ALL_INDEX_TESTS = ( + INDEXOFCP_INDEX_TYPE_AND_NUMERIC_EDGE_TESTS + + INDEXOFCP_NEGATIVE_INDEX_TESTS + + INDEXOFCP_NUMERIC_EDGE_ERROR_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_ALL_INDEX_TESTS)) +def test_indexofcp_index_types(collection, test_case: IndexOfCPTest): + """Test $indexOfCP index type acceptance, negative indexes, and numeric edge cases.""" + result = execute_expression(collection, {"$indexOfCP": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py new file mode 100644 index 00000000..3b494cce --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py @@ -0,0 +1,212 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp +from bson.code import Code + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + EXPRESSION_ARITY_ERROR, + FAILED_TO_PARSE_ERROR, + INDEXOF_INDEX_TYPE_ERROR, + INDEXOFCP_STRING_TYPE_ERROR, + INDEXOFCP_SUBSTRING_TYPE_ERROR, + INVALID_DOLLAR_FIELD_PATH, +) +from documentdb_tests.framework.test_case import
pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + IndexOfCPTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Arity]: fewer than 2 or more than 4 arguments produces an error. +INDEXOFCP_ARITY_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "arity_zero", + args=[], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject zero arguments", + ), + IndexOfCPTest( + "arity_one", + args=["hello"], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject one argument", + ), + IndexOfCPTest( + "arity_five", + args=["hello", "h", 0, 5, 1], + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject five arguments", + ), +] + + +# Property [Syntax]: the argument must be an array; non-array values produce an error. +INDEXOFCP_SYNTAX_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "syntax_string", + args="hello", # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject string as argument", + ), + IndexOfCPTest( + "syntax_int", + args=42, # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject int as argument", + ), + IndexOfCPTest( + "syntax_bool", + args=True, # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject bool as argument", + ), + IndexOfCPTest( + "syntax_binary", + args=Binary(b"data"), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject binary as argument", + ), + IndexOfCPTest( + "syntax_date", + args=datetime(2024, 1, 1, tzinfo=timezone.utc), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject date as argument", + ), + IndexOfCPTest( + "syntax_decimal128", + args=DECIMAL128_ONE_AND_HALF, 
# type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject decimal128 as argument", + ), + IndexOfCPTest( + "syntax_float", + args=3.14, # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject float as argument", + ), + IndexOfCPTest( + "syntax_long", + args=Int64(42), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject long as argument", + ), + IndexOfCPTest( + "syntax_maxkey", + args=MaxKey(), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject maxkey as argument", + ), + IndexOfCPTest( + "syntax_minkey", + args=MinKey(), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject minkey as argument", + ), + IndexOfCPTest( + "syntax_object", + args={"a": 1}, # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject object as argument", + ), + IndexOfCPTest( + "syntax_objectid", + args=ObjectId("507f1f77bcf86cd799439011"), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject objectid as argument", + ), + IndexOfCPTest( + "syntax_regex", + args=Regex("pattern"), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject regex as argument", + ), + IndexOfCPTest( + "syntax_timestamp", + args=Timestamp(1, 1), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject timestamp as argument", + ), + IndexOfCPTest( + "syntax_code", + args=Code("function() {}"), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject javascript code as argument", + ), + IndexOfCPTest( + "syntax_code_scope", + args=Code("function() {}", {"x": 1}), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject code with scope as argument", + ), + IndexOfCPTest( + "syntax_binary_uuid", + 
args=Binary(b"data", 4), # type: ignore[arg-type] + error_code=EXPRESSION_ARITY_ERROR, + msg="$indexOfCP should reject binary UUID as argument", + ), +] + + +# Property [Expression Returning Wrong Type]: an expression that resolves to the wrong type at +# runtime is rejected with the appropriate type error for that argument position. +INDEXOFCP_EXPR_TYPE_ERROR_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "expr_type_int_in_string_pos", + args=[{"$add": [1, 2]}, "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject expression resolving to int for string arg", + ), + IndexOfCPTest( + "expr_type_int_in_substring_pos", + args=["hello", {"$add": [1, 2]}], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject expression resolving to int for substring arg", + ), + IndexOfCPTest( + "expr_type_string_in_start_pos", + args=["hello", "lo", {"$concat": ["x"]}], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject expression resolving to string for start arg", + ), +] + + +# Property [Dollar Sign Handling - Error]: a bare "$" is interpreted as a field path and "$$" is +# interpreted as an empty variable name. 
+INDEXOFCP_DOLLAR_SIGN_ERROR_TESTS: list[IndexOfCPTest] = [
+    IndexOfCPTest(
+        "dollar_bare_substring",
+        args=["a$b", "$"],
+        error_code=INVALID_DOLLAR_FIELD_PATH,
+        msg="$indexOfCP should reject bare '$' as field path in substring",
+    ),
+    IndexOfCPTest(
+        "dollar_double_substring",
+        args=["hello", "$$"],
+        error_code=FAILED_TO_PARSE_ERROR,
+        msg="$indexOfCP should reject '$$' as empty variable name in substring",
+    ),
+]
+
+INDEXOFCP_INVALID_ARG_TESTS = (
+    INDEXOFCP_ARITY_TESTS
+    + INDEXOFCP_SYNTAX_TESTS
+    + INDEXOFCP_EXPR_TYPE_ERROR_TESTS
+    + INDEXOFCP_DOLLAR_SIGN_ERROR_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_INVALID_ARG_TESTS))
+def test_indexofcp_invalid_args(collection, test_case: IndexOfCPTest):
+    """Evaluate $indexOfCP with each invalid argument form and check the expected outcome."""
+    expression = {"$indexOfCP": test_case.args}
+    outcome = execute_expression(collection, expression)
+    expected, error_code, msg = test_case.expected, test_case.error_code, test_case.msg
+    assertResult(outcome, expected=expected, error_code=error_code, msg=msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py
new file mode 100644
index 00000000..8ff62b52
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import INDEXOFCP_SUBSTRING_TYPE_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import MISSING
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import (
+    IndexOfCPTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Argument shapes for
null/missing first-arg tests. _PLACEHOLDER is replaced with None or MISSING.
+_PLACEHOLDER = object()
+_NULL_PATTERNS = [
+    ([_PLACEHOLDER, "hello"], "first_arg"),
+    ([_PLACEHOLDER, "hello", 0], "first_arg_with_start"),
+    ([_PLACEHOLDER, "hello", 0, 5], "first_arg_with_start_end"),
+    # First arg null/missing takes precedence over second arg null (which would otherwise error).
+    ([_PLACEHOLDER, None], "precedence_second_null"),
+    # First arg null/missing takes precedence over start null (which would otherwise error).
+    ([_PLACEHOLDER, "sub", None], "precedence_start_null"),
+    # First arg null/missing takes precedence over end null (which would otherwise error).
+    ([_PLACEHOLDER, "sub", 0, None], "precedence_end_null"),
+    # First arg null/missing takes precedence over errors from other null args.
+    ([_PLACEHOLDER, None, None], "precedence_all_null"),
+    ([_PLACEHOLDER, None, None, None], "precedence_all_four_null"),
+]
+
+
+def _build_null_tests(null_value, prefix) -> list[IndexOfCPTest]:
+    """Build one null-result case per _NULL_PATTERNS shape, substituting null_value."""
+    cases: list[IndexOfCPTest] = []
+    for shape, suffix in _NULL_PATTERNS:
+        # Swap the sentinel for the concrete null-like value; other arguments pass through.
+        filled = [null_value if slot is _PLACEHOLDER else slot for slot in shape]
+        name = f"{prefix}_{suffix}"
+        msg = f"$indexOfCP should return null when {prefix} {suffix}"
+        cases.append(IndexOfCPTest(name, args=filled, expected=None, msg=msg))
+    return cases
+
+
+# Property [Null Behavior]: when the first argument is null, the result is null regardless of other
+# arguments.
+INDEXOFCP_NULL_TESTS = _build_null_tests(None, "null")
+
+
+# Property [Missing Behavior]: when the first argument references a missing field, the result is
+# null regardless of other arguments.
+INDEXOFCP_MISSING_TESTS = _build_null_tests(MISSING, "missing")
+
+# Property [Null and Missing Errors - Precedence]: second arg error takes precedence over
+# third/fourth arg errors.
+INDEXOFCP_NULL_MISSING_PRECEDENCE_TESTS: list[IndexOfCPTest] = [
+    IndexOfCPTest(
+        "null_err_second_precedes_third",
+        args=["hello", None, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP null substring error should precede null start error",
+    ),
+    IndexOfCPTest(
+        "null_err_second_precedes_fourth",
+        args=["hello", None, 0, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP null substring error should precede null end error",
+    ),
+    IndexOfCPTest(
+        "null_err_second_precedes_all",
+        args=["hello", None, None, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP null substring error should precede all other null errors",
+    ),
+    IndexOfCPTest(
+        "missing_err_second_precedes_third",
+        args=["hello", MISSING, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP missing substring error should precede null start error",
+    ),
+    IndexOfCPTest(
+        "missing_err_second_precedes_fourth",
+        args=["hello", MISSING, 0, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP missing substring error should precede null end error",
+    ),
+    IndexOfCPTest(
+        "missing_err_second_precedes_all",
+        args=["hello", MISSING, None, None],
+        error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR,
+        msg="$indexOfCP missing substring error should precede all other null errors",
+    ),
+]
+
+INDEXOFCP_NULL_AND_MISSING_TESTS = (
+    INDEXOFCP_NULL_TESTS + INDEXOFCP_MISSING_TESTS + INDEXOFCP_NULL_MISSING_PRECEDENCE_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_NULL_AND_MISSING_TESTS))
+def test_indexofcp_null(collection, test_case: IndexOfCPTest):
+    """Check $indexOfCP results and errors for null or missing arguments."""
+    expression = {"$indexOfCP": test_case.args}
+    outcome = execute_expression(collection, expression)
+    expected, error_code, msg = test_case.expected, test_case.error_code, test_case.msg
+    assertResult(outcome, expected=expected, error_code=error_code, msg=msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py
b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py new file mode 100644 index 00000000..53248c7c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + IndexOfCPTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [First Occurrence]: when the substring appears multiple times, the result is the code +# point index of the first occurrence. +INDEXOFCP_FIRST_OCCURRENCE_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "first_occ_from_start", + args=["abcabc", "abc"], + expected=0, + msg="$indexOfCP should find first occurrence at start", + ), + IndexOfCPTest( + "first_occ_start_skips_first", + args=["abcabc", "abc", 1], + expected=3, + msg="$indexOfCP should find second occurrence when start skips first", + ), + IndexOfCPTest( + "first_occ_repeated_char", + args=["aaaa", "a"], + expected=0, + msg="$indexOfCP should find first of repeated single characters", + ), + IndexOfCPTest( + "first_occ_repeated_char_with_start", + args=["aaaa", "a", 2], + expected=2, + msg="$indexOfCP should find first occurrence at start offset in repeated chars", + ), + IndexOfCPTest( + "first_occ_with_end", + args=["abcabc", "abc", 0, 6], + expected=0, + msg="$indexOfCP should find first occurrence within end boundary", + ), + IndexOfCPTest( + "first_occ_start_and_end", + args=["abcabc", "abc", 1, 6], + expected=3, + msg="$indexOfCP should find occurrence within start and end range", + ), +] + + +# Property [Empty Substring]: empty substring behavior depends on start relative to 
string code +# point length. +INDEXOFCP_EMPTY_SUBSTRING_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "empty_sub_empty_string", + args=["", ""], + expected=0, + msg="$indexOfCP should return 0 for empty substring in empty string", + ), + IndexOfCPTest( + "empty_sub_non_empty_string", + args=["hello", ""], + expected=0, + msg="$indexOfCP should return 0 for empty substring in non-empty string", + ), + IndexOfCPTest( + "empty_sub_with_start_at_middle_cp", + args=["hello", "", 3], + expected=3, + msg="$indexOfCP should return start for empty substring at middle code point", + ), + IndexOfCPTest( + "empty_sub_start_at_last_cp", + args=["hello", "", 4], + expected=4, + msg="$indexOfCP should return start for empty substring at last code point", + ), + # For $indexOfCP, start >= code point length returns -1 (unlike $indexOfBytes which returns + # start when start == byte length). + IndexOfCPTest( + "empty_sub_start_at_cp_length", + args=["hello", "", 5], + expected=-1, + msg="$indexOfCP should return -1 for empty substring at code point length", + ), + IndexOfCPTest( + "empty_sub_start_beyond_cp_length", + args=["hello", "", 6], + expected=-1, + msg="$indexOfCP should return -1 for empty substring beyond code point length", + ), + # Multi-byte string: "café" is 4 code points (é is 1 code point). + IndexOfCPTest( + "empty_sub_multibyte_start_at_cp_length", + args=["café", "", 4], + expected=-1, + msg="$indexOfCP should return -1 for empty substring at multi-byte string cp length", + ), + IndexOfCPTest( + "empty_sub_multibyte_start_beyond_cp_length", + args=["café", "", 5], + expected=-1, + msg="$indexOfCP should return -1 for empty substring beyond multi-byte string cp length", + ), + # Empty search range: start == end returns -1 regardless of position, start > end also returns + # -1. 
+ IndexOfCPTest( + "empty_sub_edge_start_eq_end_zero", + args=["hello", "", 0, 0], + expected=-1, + msg="$indexOfCP should return -1 for empty substring with start=end=0", + ), + IndexOfCPTest( + "empty_sub_edge_start_eq_end_mid", + args=["hello", "", 2, 2], + expected=-1, + msg="$indexOfCP should return -1 for empty substring with start=end mid-string", + ), + # start == end == cp_length also returns -1. + IndexOfCPTest( + "empty_sub_edge_start_eq_end_at_len", + args=["hello", "", 5, 5], + expected=-1, + msg="$indexOfCP should return -1 for empty substring with start=end at cp length", + ), + IndexOfCPTest( + "empty_sub_edge_start_gt_end", + args=["hello", "", 3, 1], + expected=-1, + msg="$indexOfCP should return -1 for empty substring when start > end", + ), + IndexOfCPTest( + "empty_sub_edge_start_gt_end_bounds", + args=["hello", "", 5, 0], + expected=-1, + msg="$indexOfCP should return -1 for empty substring when start at length > end", + ), +] + + +# Property [Start and End Range]: start and end constrain the code point range searched. +INDEXOFCP_RANGE_TESTS: list[IndexOfCPTest] = [ + # "lo" starts at code point 3, but end=3 means only code points 0-2 are searched. 
+ IndexOfCPTest( + "range_end_excludes_match", + args=["hello", "lo", 0, 3], + expected=-1, + msg="$indexOfCP should return -1 when end excludes the match position", + ), + IndexOfCPTest( + "range_end_beyond_length", + args=["hello", "lo", 0, 100], + expected=3, + msg="$indexOfCP should find match when end exceeds string length", + ), + IndexOfCPTest( + "range_start_greater_than_end", + args=["hello", "lo", 4, 2], + expected=-1, + msg="$indexOfCP should return -1 when start > end", + ), + IndexOfCPTest( + "range_start_equals_end", + args=["hello", "lo", 3, 3], + expected=-1, + msg="$indexOfCP should return -1 when start equals end", + ), + IndexOfCPTest( + "range_start_at_cp_length", + args=["hello", "lo", 5], + expected=-1, + msg="$indexOfCP should return -1 when start is at code point length", + ), + IndexOfCPTest( + "range_start_at_last_cp", + args=["hello", "o", 4], + expected=4, + msg="$indexOfCP should find single char at last code point position", + ), + IndexOfCPTest( + "range_start_beyond_cp_length", + args=["hello", "lo", 10], + expected=-1, + msg="$indexOfCP should return -1 when start is beyond code point length", + ), + # "llo" starts at code point 2. Unlike $indexOfBytes, $indexOfCP finds the match as long as it + # starts within the range. + IndexOfCPTest( + "range_match_starts_within_end", + args=["hello", "llo", 0, 4], + expected=2, + msg="$indexOfCP should find match that starts within end boundary", + ), + IndexOfCPTest( + "range_match_fits_within_end", + args=["hello", "llo", 0, 5], + expected=2, + msg="$indexOfCP should find match that fits within end boundary", + ), +] + + +# Property [Overlapping Matches]: when overlapping matches exist, the result is the code point index +# of the first overlapping match. 
+INDEXOFCP_OVERLAPPING_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "overlap_ascii", + args=["aaa", "aa"], + expected=0, + msg="$indexOfCP should return first overlapping match for ASCII", + ), + IndexOfCPTest( + "overlap_2byte", + args=["éééé", "éé"], + expected=0, + msg="$indexOfCP should return first overlapping match for 2-byte chars", + ), + IndexOfCPTest( + "overlap_3byte", + args=["中中中中", "中中"], + expected=0, + msg="$indexOfCP should return first overlapping match for 3-byte chars", + ), +] + + +# Property [Edge Cases]: the operator handles boundary inputs correctly. +INDEXOFCP_EDGE_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "edge_empty_string", + args=["", "hello"], + expected=-1, + msg="$indexOfCP should return -1 searching non-empty substring in empty string", + ), + IndexOfCPTest( + "edge_at_end", + args=["hello", "o"], + expected=4, + msg="$indexOfCP should find substring at end of string", + ), + IndexOfCPTest( + "edge_equals_entire", + args=["hello", "hello"], + expected=0, + msg="$indexOfCP should return 0 when substring equals entire string", + ), + IndexOfCPTest( + "edge_longer_than_string", + args=["hi", "hello"], + expected=-1, + msg="$indexOfCP should return -1 when substring is longer than string", + ), + # Number-like strings are treated as strings, not coerced to numeric types. 
+    IndexOfCPTest(
+        "edge_number_like_zero",
+        args=["a0b3", "0"],
+        expected=1,
+        msg="$indexOfCP should find '0' as a string, not coerce to number",
+    ),
+    IndexOfCPTest(
+        "edge_number_like_digit",
+        args=["a0b3", "3"],
+        expected=3,
+        msg="$indexOfCP should find '3' as a string, not coerce to number",
+    ),
+]
+
+INDEXOFCP_SEARCH_TESTS = (
+    INDEXOFCP_FIRST_OCCURRENCE_TESTS
+    + INDEXOFCP_EMPTY_SUBSTRING_TESTS
+    + INDEXOFCP_RANGE_TESTS
+    + INDEXOFCP_OVERLAPPING_TESTS
+    + INDEXOFCP_EDGE_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_SEARCH_TESTS))
+def test_indexofcp_search(collection, test_case: IndexOfCPTest):
+    """Evaluate $indexOfCP for each search case and check the expected outcome."""
+    expression = {"$indexOfCP": test_case.args}
+    outcome = execute_expression(collection, expression)
+    expected, error_code, msg = test_case.expected, test_case.error_code, test_case.msg
+    assertResult(outcome, expected=expected, error_code=error_code, msg=msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py
new file mode 100644
index 00000000..7c983b5c
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import (
+    IndexOfCPTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Property [String Size Limit - Success]: string and substring arguments just under the size limit
+# are
accepted.
+INDEXOFCP_SIZE_LIMIT_SUCCESS_TESTS: list[IndexOfCPTest] = [
+    IndexOfCPTest(
+        "size_string_one_under",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 1), "a"],
+        expected=0,
+        msg="$indexOfCP should accept string one byte under the size limit",
+    ),
+    IndexOfCPTest(
+        "size_substr_one_under",
+        args=["hello", "a" * (STRING_SIZE_LIMIT_BYTES - 1)],
+        expected=-1,
+        msg="$indexOfCP should accept substring one byte under the size limit",
+    ),
+    # 2-byte "é" + "a": limit-1 bytes (assumes an even limit), about half as many code points.
+    IndexOfCPTest(
+        "size_string_one_under_2byte",
+        args=["é" * ((STRING_SIZE_LIMIT_BYTES - 1) // 2) + "a", "a"],
+        expected=(STRING_SIZE_LIMIT_BYTES - 1) // 2,
+        msg="$indexOfCP should accept 2-byte char string one byte under limit",
+    ),
+    # The only "b" is the last character of a limit-1 byte string, at index limit-2.
+    IndexOfCPTest(
+        "size_found_at_end",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 2) + "b", "b"],
+        expected=STRING_SIZE_LIMIT_BYTES - 2,
+        msg="$indexOfCP should find match at end of a large string",
+    ),
+    # No "b" anywhere in a string one byte under the limit.
+    IndexOfCPTest(
+        "size_not_found",
+        args=["a" * (STRING_SIZE_LIMIT_BYTES - 1), "b"],
+        expected=-1,
+        msg="$indexOfCP should return -1 for no match in a large string",
+    ),
+]
+
+
+# Property [String Size Limit - Error]: a string or substring argument at or above the size limit
+# produces an error.
+INDEXOFCP_SIZE_LIMIT_ERROR_TESTS: list[IndexOfCPTest] = [
+    IndexOfCPTest(
+        "size_string_at_limit",
+        args=["a" * STRING_SIZE_LIMIT_BYTES, "b"],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfCP should reject string at the size limit",
+    ),
+    IndexOfCPTest(
+        "size_substr_at_limit",
+        args=["hello", "a" * STRING_SIZE_LIMIT_BYTES],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfCP should reject substring at the size limit",
+    ),
+    # 2-byte "é" repeated limit//2 times: exactly limit bytes (assumes an even limit).
+    IndexOfCPTest(
+        "size_string_at_limit_2byte",
+        args=["é" * (STRING_SIZE_LIMIT_BYTES // 2), "b"],
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$indexOfCP should reject 2-byte char string at the byte size limit",
+    ),
+]
+
+INDEXOFCP_SIZE_LIMIT_TESTS = INDEXOFCP_SIZE_LIMIT_SUCCESS_TESTS + INDEXOFCP_SIZE_LIMIT_ERROR_TESTS
+
+
+@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_SIZE_LIMIT_TESTS))
+def test_indexofcp_size_limit(collection, test_case: IndexOfCPTest):
+    """Evaluate $indexOfCP near the string size limit and check the result or error code."""
+    expression = {"$indexOfCP": test_case.args}
+    outcome = execute_expression(collection, expression)
+    expected, error_code, msg = test_case.expected, test_case.error_code, test_case.msg
+    assertResult(outcome, expected=expected, error_code=error_code, msg=msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py
new file mode 100644
index 00000000..5c0fb7ed
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py
@@ -0,0 +1,474 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import pytest
+from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+from bson.code import Code
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    INDEXOF_INDEX_TYPE_ERROR,
+    INDEXOFCP_STRING_TYPE_ERROR,
+    INDEXOFCP_SUBSTRING_TYPE_ERROR,
+)
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+    MISSING,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import (
+    IndexOfCPTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Property [Type Strictness]:
arguments of incorrect type produce an error. +INDEXOFCP_TYPE_ERROR_TESTS: list[IndexOfCPTest] = [ + # First arg: non-string, non-null types + IndexOfCPTest( + "type_first_array", + args=[["a"], "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject array as string argument", + ), + IndexOfCPTest( + "type_first_binary", + args=[Binary(b"data"), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject binary as string argument", + ), + IndexOfCPTest( + "type_first_bool", + args=[True, "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject bool as string argument", + ), + IndexOfCPTest( + "type_first_date", + args=[datetime(2024, 1, 1, tzinfo=timezone.utc), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject date as string argument", + ), + IndexOfCPTest( + "type_first_decimal128", + args=[DECIMAL128_ONE_AND_HALF, "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject decimal128 as string argument", + ), + IndexOfCPTest( + "type_first_float", + args=[3.14, "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject float as string argument", + ), + IndexOfCPTest( + "type_first_int", + args=[42, "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject int as string argument", + ), + IndexOfCPTest( + "type_first_long", + args=[Int64(42), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject long as string argument", + ), + IndexOfCPTest( + "type_first_maxkey", + args=[MaxKey(), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject maxkey as string argument", + ), + IndexOfCPTest( + "type_first_minkey", + args=[MinKey(), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject minkey as string argument", + ), + IndexOfCPTest( + "type_first_object", + args=[{"a": 1}, "sub"], + 
error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject object as string argument", + ), + IndexOfCPTest( + "type_first_objectid", + args=[ObjectId("507f1f77bcf86cd799439011"), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject objectid as string argument", + ), + IndexOfCPTest( + "type_first_regex", + args=[Regex("pattern"), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject regex as string argument", + ), + IndexOfCPTest( + "type_first_timestamp", + args=[Timestamp(1, 1), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject timestamp as string argument", + ), + IndexOfCPTest( + "type_first_code", + args=[Code("function() {}"), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject javascript code as string argument", + ), + IndexOfCPTest( + "type_first_code_scope", + args=[Code("function() {}", {"x": 1}), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject code with scope as string argument", + ), + IndexOfCPTest( + "type_first_binary_uuid", + args=[Binary(b"data", 4), "sub"], + error_code=INDEXOFCP_STRING_TYPE_ERROR, + msg="$indexOfCP should reject binary UUID as string argument", + ), + # Second arg: non-string types (including null and missing) + IndexOfCPTest( + "type_second_array", + args=["hello", ["a"]], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject array as substring argument", + ), + IndexOfCPTest( + "type_second_binary", + args=["hello", Binary(b"data")], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject binary as substring argument", + ), + IndexOfCPTest( + "type_second_bool", + args=["hello", True], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject bool as substring argument", + ), + IndexOfCPTest( + "type_second_date", + args=["hello", datetime(2024, 1, 1, tzinfo=timezone.utc)], + 
error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject date as substring argument", + ), + IndexOfCPTest( + "type_second_decimal128", + args=["hello", DECIMAL128_ONE_AND_HALF], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject decimal128 as substring argument", + ), + IndexOfCPTest( + "type_second_float", + args=["hello", 3.14], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject float as substring argument", + ), + IndexOfCPTest( + "type_second_int", + args=["hello", 42], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject int as substring argument", + ), + IndexOfCPTest( + "type_second_long", + args=["hello", Int64(42)], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject long as substring argument", + ), + IndexOfCPTest( + "type_second_maxkey", + args=["hello", MaxKey()], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject maxkey as substring argument", + ), + IndexOfCPTest( + "type_second_minkey", + args=["hello", MinKey()], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject minkey as substring argument", + ), + IndexOfCPTest( + "type_second_null", + args=["hello", None], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject null as substring argument", + ), + IndexOfCPTest( + "type_second_object", + args=["hello", {"a": 1}], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject object as substring argument", + ), + IndexOfCPTest( + "type_second_objectid", + args=["hello", ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject objectid as substring argument", + ), + IndexOfCPTest( + "type_second_regex", + args=["hello", Regex("pattern")], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject regex as substring argument", + ), + IndexOfCPTest( + 
"type_second_timestamp", + args=["hello", Timestamp(1, 1)], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject timestamp as substring argument", + ), + IndexOfCPTest( + "type_second_code", + args=["hello", Code("function() {}")], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject javascript code as substring argument", + ), + IndexOfCPTest( + "type_second_code_scope", + args=["hello", Code("function() {}", {"x": 1})], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject code with scope as substring argument", + ), + IndexOfCPTest( + "type_second_binary_uuid", + args=["hello", Binary(b"data", 4)], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject binary UUID as substring argument", + ), + IndexOfCPTest( + "type_second_missing", + args=["hello", MISSING], + error_code=INDEXOFCP_SUBSTRING_TYPE_ERROR, + msg="$indexOfCP should reject missing field as substring argument", + ), + # Third arg (start): non-numeric types + IndexOfCPTest( + "type_third_array", + args=["hello", "h", ["a"]], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject array as start argument", + ), + IndexOfCPTest( + "type_third_binary", + args=["hello", "h", Binary(b"data")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject binary as start argument", + ), + IndexOfCPTest( + "type_third_bool", + args=["hello", "h", True], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject bool as start argument", + ), + IndexOfCPTest( + "type_third_date", + args=["hello", "h", datetime(2024, 1, 1, tzinfo=timezone.utc)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject date as start argument", + ), + IndexOfCPTest( + "type_third_maxkey", + args=["hello", "h", MaxKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject maxkey as start argument", + ), + IndexOfCPTest( + "type_third_minkey", + args=["hello", "h", 
MinKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject minkey as start argument", + ), + IndexOfCPTest( + "type_third_null", + args=["hello", "h", None], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject null as start argument", + ), + IndexOfCPTest( + "type_third_object", + args=["hello", "h", {"a": 1}], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject object as start argument", + ), + IndexOfCPTest( + "type_third_objectid", + args=["hello", "h", ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject objectid as start argument", + ), + IndexOfCPTest( + "type_third_regex", + args=["hello", "h", Regex("pattern")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject regex as start argument", + ), + IndexOfCPTest( + "type_third_timestamp", + args=["hello", "h", Timestamp(1, 1)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject timestamp as start argument", + ), + IndexOfCPTest( + "type_third_code", + args=["hello", "h", Code("function() {}")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject javascript code as start argument", + ), + IndexOfCPTest( + "type_third_code_scope", + args=["hello", "h", Code("function() {}", {"x": 1})], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject code with scope as start argument", + ), + IndexOfCPTest( + "type_third_binary_uuid", + args=["hello", "h", Binary(b"data", 4)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject binary UUID as start argument", + ), + IndexOfCPTest( + "type_third_string", + args=["hello", "h", "x"], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject string as start argument", + ), + IndexOfCPTest( + "type_third_missing", + args=["hello", "h", MISSING], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject missing field as start argument", + ), + # 
Fourth arg (end): non-numeric types + IndexOfCPTest( + "type_fourth_array", + args=["hello", "h", 0, ["a"]], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject array as end argument", + ), + IndexOfCPTest( + "type_fourth_binary", + args=["hello", "h", 0, Binary(b"data")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject binary as end argument", + ), + IndexOfCPTest( + "type_fourth_bool", + args=["hello", "h", 0, True], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject bool as end argument", + ), + IndexOfCPTest( + "type_fourth_date", + args=["hello", "h", 0, datetime(2024, 1, 1, tzinfo=timezone.utc)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject date as end argument", + ), + IndexOfCPTest( + "type_fourth_maxkey", + args=["hello", "h", 0, MaxKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject maxkey as end argument", + ), + IndexOfCPTest( + "type_fourth_minkey", + args=["hello", "h", 0, MinKey()], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject minkey as end argument", + ), + IndexOfCPTest( + "type_fourth_null", + args=["hello", "h", 0, None], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject null as end argument", + ), + IndexOfCPTest( + "type_fourth_object", + args=["hello", "h", 0, {"a": 1}], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject object as end argument", + ), + IndexOfCPTest( + "type_fourth_objectid", + args=["hello", "h", 0, ObjectId("507f1f77bcf86cd799439011")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject objectid as end argument", + ), + IndexOfCPTest( + "type_fourth_regex", + args=["hello", "h", 0, Regex("pattern")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject regex as end argument", + ), + IndexOfCPTest( + "type_fourth_timestamp", + args=["hello", "h", 0, Timestamp(1, 1)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + 
msg="$indexOfCP should reject timestamp as end argument", + ), + IndexOfCPTest( + "type_fourth_code", + args=["hello", "h", 0, Code("function() {}")], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject javascript code as end argument", + ), + IndexOfCPTest( + "type_fourth_code_scope", + args=["hello", "h", 0, Code("function() {}", {"x": 1})], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject code with scope as end argument", + ), + IndexOfCPTest( + "type_fourth_binary_uuid", + args=["hello", "h", 0, Binary(b"data", 4)], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject binary UUID as end argument", + ), + IndexOfCPTest( + "type_fourth_string", + args=["hello", "h", 0, "x"], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject string as end argument", + ), + IndexOfCPTest( + "type_fourth_missing", + args=["hello", "h", 0, MISSING], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject missing field as end argument", + ), + # Non-integral floats and decimals are rejected as index arguments even though + # whole-number values of these types are accepted (see INDEXOFCP_INDEX_TYPE_TESTS). 
+ IndexOfCPTest( + "type_third_non_integral_float", + args=["hello", "h", 3.14], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject non-integral float as start", + ), + IndexOfCPTest( + "type_third_non_integral_decimal128", + args=["hello", "h", DECIMAL128_ONE_AND_HALF], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject non-integral Decimal128 as start", + ), + IndexOfCPTest( + "type_fourth_non_integral_float", + args=["hello", "h", 0, 3.14], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject non-integral float as end", + ), + IndexOfCPTest( + "type_fourth_non_integral_decimal128", + args=["hello", "h", 0, DECIMAL128_ONE_AND_HALF], + error_code=INDEXOF_INDEX_TYPE_ERROR, + msg="$indexOfCP should reject non-integral Decimal128 as end", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_TYPE_ERROR_TESTS)) +def test_indexofcp_type_errors(collection, test_case: IndexOfCPTest): + """Test $indexOfCP type error behavior.""" + result = execute_expression(collection, {"$indexOfCP": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py new file mode 100644 index 00000000..214d58dc --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES +from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common 
import ( + IndexOfCPTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_project_with_insert, +) + +# Property [Expression Arguments]: all argument positions accept expressions that resolve to the +# expected type. +INDEXOFCP_EXPR_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "expr_first_arg", + args=[{"$concat": ["hel", "lo"]}, "lo"], + expected=3, + msg="$indexOfCP should accept expression for string argument", + ), + IndexOfCPTest( + "expr_second_arg", + args=["hello", {"$concat": ["l", "o"]}], + expected=3, + msg="$indexOfCP should accept expression for substring argument", + ), + IndexOfCPTest( + "expr_third_arg", + args=["hello", "lo", {"$add": [1, 2]}], + expected=3, + msg="$indexOfCP should accept expression for start argument", + ), + IndexOfCPTest( + "expr_fourth_arg", + args=["hello", "lo", 0, {"$add": [3, 2]}], + expected=3, + msg="$indexOfCP should accept expression for end argument", + ), +] + +# Property [Dollar Sign Handling - Success]: using $literal for a dollar sign substring avoids field +# path interpretation and finds the character correctly. +INDEXOFCP_DOLLAR_SIGN_SUCCESS_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "dollar_literal_finds_dollar", + args=["a$b", {"$literal": "$"}], + expected=1, + msg="$indexOfCP should find dollar sign via $literal", + ), +] + + +INDEXOFCP_USAGE_TESTS = INDEXOFCP_EXPR_TESTS + INDEXOFCP_DOLLAR_SIGN_SUCCESS_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_USAGE_TESTS)) +def test_indexofcp_cases(collection, test_case: IndexOfCPTest): + """Test $indexOfCP cases.""" + result = execute_expression(collection, {"$indexOfCP": test_case.args}) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) + + +# Property [Document Field References]: $indexOfCP works with field references +# from inserted documents, not just inline literals. 
+def test_indexofcp_document_fields(collection): + """Test $indexOfCP reads values from document fields.""" + result = execute_project_with_insert( + collection, + {"s": "hello", "sub": "lo"}, + {"result": {"$indexOfCP": ["$s", "$sub"]}}, + ) + assertSuccess( + result, + [{"result": 3}], + msg="$indexOfCP should find substring from document field references", + ) + + +# Property [Nested Field Paths]: $indexOfCP resolves dotted field paths in nested documents. +def test_indexofcp_nested_field_paths(collection): + """Test $indexOfCP reads values from nested document field paths.""" + result = execute_project_with_insert( + collection, + {"a": {"b": "hello"}, "c": {"d": "lo"}}, + {"result": {"$indexOfCP": ["$a.b", "$c.d"]}}, + ) + assertSuccess( + result, + [{"result": 3}], + msg="$indexOfCP should find substring from nested field paths", + ) + + +# Property [Return Type]: the result is always type int, including when not found and for large +# indices. +INDEXOFCP_RETURN_TYPE_TESTS: list[IndexOfCPTest] = [ + IndexOfCPTest( + "return_type_two_args", + args=["hello", "ell"], + msg="$indexOfCP should return int type with two args", + ), + IndexOfCPTest( + "return_type_three_args", + args=["hello", "lo", 2], + msg="$indexOfCP should return int type with three args", + ), + IndexOfCPTest( + "return_type_four_args", + args=["hello", "ll", 0, 5], + msg="$indexOfCP should return int type with four args", + ), + IndexOfCPTest( + "return_type_not_found", + args=["hello", "xyz"], + msg="$indexOfCP should return int type when not found", + ), + IndexOfCPTest( + "return_type_large_index", + args=["a" * (STRING_SIZE_LIMIT_BYTES - 2) + "b", "b"], + msg="$indexOfCP should return int type for large index value", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(INDEXOFCP_RETURN_TYPE_TESTS)) +def test_indexofcp_return_type(collection, test_case: IndexOfCPTest): + """Test $indexOfCP result is always type int.""" + result = execute_expression(collection, {"$type": 
{"$indexOfCP": test_case.args}}) + assertSuccess(result, [{"result": "int"}], msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/indexOfCP_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/indexOfCP_common.py new file mode 100644 index 00000000..4c4c77c1 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/utils/indexOfCP_common.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from documentdb_tests.framework.test_case import BaseTestCase + + +@dataclass(frozen=True) +class IndexOfCPTest(BaseTestCase): + """Test case for $indexOfCP operator.""" + + # Uses args because start and end are optional positional parameters. + # Named fields would be ambiguous about whether an unset optional + # should be omitted from the array or passed as None. 
+ args: list[Any] = None # type: ignore[assignment] diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py new file mode 100644 index 00000000..ed5368ac --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + StrLenBytesTest, + _expr, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) + +# Property [Core Behavior]: returns the number of UTF-8 encoded bytes in the input string. 
STRLENBYTES_CORE_TESTS: list[StrLenBytesTest] = [
    StrLenBytesTest(
        "core_empty", value="", expected=0, msg="$strLenBytes of empty string should be 0"
    ),
    StrLenBytesTest("core_space", value=" ", expected=1, msg="$strLenBytes of space should be 1"),
    StrLenBytesTest(
        "core_multiple_space",
        # Built by repetition so the input is unambiguously three spaces: a run of spaces inside
        # a literal is easy to lose to whitespace normalisation, which would leave a 1-byte input
        # paired with expected=3.
        value=" " * 3,
        expected=3,
        msg="$strLenBytes of three spaces should be 3",
    ),
    StrLenBytesTest(
        "core_ascii_word",
        value="hello",
        expected=5,
        msg="$strLenBytes of ASCII word should equal character count",
    ),
    StrLenBytesTest(
        "core_newline", value="\n", expected=1, msg="$strLenBytes of newline should be 1"
    ),
    StrLenBytesTest("core_tab", value="\t", expected=1, msg="$strLenBytes of tab should be 1"),
    StrLenBytesTest(
        "core_cr", value="\r", expected=1, msg="$strLenBytes of carriage return should be 1"
    ),
    StrLenBytesTest(
        "core_null_byte", value="\x00", expected=1, msg="$strLenBytes of null byte should be 1"
    ),
    StrLenBytesTest("core_crlf", value="\r\n", expected=2, msg="$strLenBytes of CRLF should be 2"),
    # 2-byte: Latin extended, Greek.
    StrLenBytesTest(
        "core_2byte_e_acute", value="é", expected=2, msg="$strLenBytes of 2-byte é should be 2"
    ),
    StrLenBytesTest(
        "core_2byte_n_tilde", value="ñ", expected=2, msg="$strLenBytes of 2-byte ñ should be 2"
    ),
    StrLenBytesTest(
        "core_2byte_greek",
        value="λ",
        expected=2,
        msg="$strLenBytes of 2-byte Greek lambda should be 2",
    ),
    # 3-byte: CJK, Euro sign, BOM, ZWJ.
    StrLenBytesTest(
        "core_3byte_cjk",
        value="寿",
        expected=3,
        msg="$strLenBytes of 3-byte CJK character should be 3",
    ),
    StrLenBytesTest(
        "core_3byte_euro", value="€", expected=3, msg="$strLenBytes of 3-byte Euro sign should be 3"
    ),
    StrLenBytesTest(
        "core_3byte_bom", value="\ufeff", expected=3, msg="$strLenBytes of 3-byte BOM should be 3"
    ),
    StrLenBytesTest(
        "core_3byte_zwj", value="\u200d", expected=3, msg="$strLenBytes of 3-byte ZWJ should be 3"
    ),
    # 4-byte: emoji, math symbols.
    StrLenBytesTest(
        "core_4byte_emoji", value="😀", expected=4, msg="$strLenBytes of 4-byte emoji should be 4"
    ),
    StrLenBytesTest(
        "core_4byte_math",
        value="𝜋",
        expected=4,
        msg="$strLenBytes of 4-byte math symbol should be 4",
    ),
    StrLenBytesTest(
        "core_4byte_party",
        value="🎉",
        expected=4,
        msg="$strLenBytes of 4-byte party emoji should be 4",
    ),
    # Mixed byte widths.
    StrLenBytesTest(
        "core_mixed_all_widths",
        value="aé€😀",
        expected=10,
        msg="$strLenBytes should sum bytes across 1/2/3/4-byte characters",
    ),
    StrLenBytesTest(
        "core_mixed_latin",
        value="cafétéria",
        expected=11,
        msg="$strLenBytes should count mixed ASCII and 2-byte chars correctly",
    ),
    StrLenBytesTest(
        "core_mixed_spanish",
        value="jalapeño",
        expected=9,
        msg="$strLenBytes should count ñ as 2 bytes in mixed string",
    ),
    # Precomposed U+00E9 (2 bytes) vs decomposed U+0065 + U+0301 (1 + 2 = 3 bytes).
    StrLenBytesTest(
        "core_precomposed",
        value="\u00e9",
        expected=2,
        msg="$strLenBytes of precomposed é should be 2 bytes",
    ),
    StrLenBytesTest(
        "core_decomposed",
        value="e\u0301",
        expected=3,
        msg="$strLenBytes of decomposed é (e + combining accent) should be 3 bytes",
    ),
    # ZWJ emoji sequence: 3 emoji (4 bytes each) + 2 ZWJ (3 bytes each) = 18.
    StrLenBytesTest(
        "core_zwj_emoji",
        value="👨\u200d👩\u200d👧",
        expected=18,
        msg="$strLenBytes of ZWJ emoji sequence should count all bytes including joiners",
    ),
    # Mixed scripts.
    StrLenBytesTest(
        "core_accent_word",
        value="café",
        expected=5,
        msg="$strLenBytes of 'café' should be 5 (3 ASCII + 1 two-byte)",
    ),
    StrLenBytesTest(
        "core_cjk_word",
        value="寿司",
        expected=6,
        msg="$strLenBytes of two CJK characters should be 6",
    ),
    StrLenBytesTest(
        "core_mixed_scripts",
        value="hello 世界",
        expected=12,
        msg="$strLenBytes should sum ASCII and CJK byte widths",
    ),
]

# Property [Encoding and Character Handling]: characters at UTF-8 encoding boundaries and special
# Unicode categories produce correct byte counts.
STRLENBYTES_ENCODING_TESTS: list[StrLenBytesTest] = [
    # U+00A0 non-breaking space (2 bytes).
    StrLenBytesTest(
        "encoding_nbsp",
        value="\u00a0",
        expected=2,
        msg="$strLenBytes of non-breaking space should be 2",
    ),
    # U+2000 en space (3 bytes).
    StrLenBytesTest(
        "encoding_en_space", value="\u2000", expected=3, msg="$strLenBytes of en space should be 3"
    ),
    # U+2003 em space (3 bytes).
    StrLenBytesTest(
        "encoding_em_space", value="\u2003", expected=3, msg="$strLenBytes of em space should be 3"
    ),
    # U+0001 SOH control character (1 byte).
    StrLenBytesTest(
        "encoding_control_soh",
        value="\x01",
        expected=1,
        msg="$strLenBytes of SOH control character should be 1",
    ),
    # U+001F US control character (1 byte).
    StrLenBytesTest(
        "encoding_control_us",
        value="\x1f",
        expected=1,
        msg="$strLenBytes of US control character should be 1",
    ),
    # U+200B zero-width space (3 bytes).
    StrLenBytesTest(
        "encoding_zero_width_space",
        value="\u200b",
        expected=3,
        msg="$strLenBytes of zero-width space should be 3",
    ),
    # U+200E left-to-right mark (3 bytes).
    StrLenBytesTest(
        "encoding_ltr_mark",
        value="\u200e",
        expected=3,
        msg="$strLenBytes of left-to-right mark should be 3",
    ),
    # U+200F right-to-left mark (3 bytes).
    StrLenBytesTest(
        "encoding_rtl_mark",
        value="\u200f",
        expected=3,
        msg="$strLenBytes of right-to-left mark should be 3",
    ),
    # U+D7FF: last codepoint before surrogates (3 bytes).
    StrLenBytesTest(
        "encoding_boundary_d7ff",
        value="\ud7ff",
        expected=3,
        msg="$strLenBytes of U+D7FF (last pre-surrogate) should be 3",
    ),
    # U+E000: first private use area codepoint (3 bytes).
    StrLenBytesTest(
        "encoding_boundary_e000",
        value="\ue000",
        expected=3,
        msg="$strLenBytes of U+E000 (first PUA) should be 3",
    ),
    # U+FFFF: last BMP codepoint (3 bytes).
    StrLenBytesTest(
        "encoding_boundary_ffff",
        value="\uffff",
        expected=3,
        msg="$strLenBytes of U+FFFF (last BMP) should be 3",
    ),
    # U+10000: first supplementary plane codepoint (4 bytes).
    StrLenBytesTest(
        "encoding_boundary_10000",
        value="\U00010000",
        expected=4,
        msg="$strLenBytes of U+10000 (first supplementary) should be 4",
    ),
    # U+10FFFF: last valid Unicode codepoint (4 bytes).
    StrLenBytesTest(
        "encoding_boundary_10ffff",
        value="\U0010ffff",
        expected=4,
        msg="$strLenBytes of U+10FFFF (last valid codepoint) should be 4",
    ),
    # U+10400 Deseret capital long I (4 bytes).
    StrLenBytesTest(
        "encoding_deseret",
        value="\U00010400",
        expected=4,
        msg="$strLenBytes of Deseret character should be 4",
    ),
    # German sharp s (2 bytes).
    StrLenBytesTest(
        "encoding_sharp_s", value="ß", expected=2, msg="$strLenBytes of German sharp s should be 2"
    ),
    # U+FB01 fi ligature (3 bytes).
    StrLenBytesTest(
        "encoding_fi_ligature",
        value="\ufb01",
        expected=3,
        msg="$strLenBytes of fi ligature should be 3",
    ),
    # U+0131 Turkish dotless i (2 bytes).
    StrLenBytesTest(
        "encoding_dotless_i",
        value="\u0131",
        expected=2,
        msg="$strLenBytes of Turkish dotless i should be 2",
    ),
]


# Property [Embedded Null Bytes]: null bytes in various positions do not cause early string
# termination.
+STRLENBYTES_NULL_BYTE_TESTS: list[StrLenBytesTest] = [ + StrLenBytesTest( + "null_byte_at_start", + value="\x00xyz", + expected=4, + msg="$strLenBytes should not terminate early on leading null byte", + ), + StrLenBytesTest( + "null_byte_at_end", + value="xyz\x00", + expected=4, + msg="$strLenBytes should not terminate early on trailing null byte", + ), + StrLenBytesTest( + "null_byte_multiple", + value="a\x00b\x00c\x00", + expected=6, + msg="$strLenBytes should count all bytes with interleaved null bytes", + ), + StrLenBytesTest( + "null_byte_with_multibyte", + value="寿\x00司", + expected=7, + msg="$strLenBytes should count all bytes with null byte between multibyte chars", + ), +] + +STRLENBYTES_BYTE_COUNT_TESTS = ( + STRLENBYTES_CORE_TESTS + STRLENBYTES_ENCODING_TESTS + STRLENBYTES_NULL_BYTE_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_BYTE_COUNT_TESTS)) +def test_strlenbytes_cases(collection, test_case: StrLenBytesTest): + """Test $strLenBytes byte count cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py new file mode 100644 index 00000000..fc281569 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + StrLenBytesTest, + _expr, +) +from 
documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_expression_with_insert, + execute_project_with_insert, +) + +# Property [Expression Arguments]: the argument accepts any expression that resolves to a string. +STRLENBYTES_EXPR_TESTS: list[StrLenBytesTest] = [ + StrLenBytesTest( + "expr_concat", + value={"$concat": ["hel", "lo"]}, + expected=5, + msg="$strLenBytes should accept $concat expression as argument", + ), + StrLenBytesTest( + "expr_toupper", + value={"$toUpper": "hello"}, + expected=5, + msg="$strLenBytes should accept $toUpper expression as argument", + ), +] + +# Property [Array Syntax]: a literal single-element array is parsed as one argument. +STRLENBYTES_ARRAY_SYNTAX_TESTS: list[StrLenBytesTest] = [ + StrLenBytesTest( + "expr_array_syntax_ascii", + value=["hello"], + expected=5, + msg="$strLenBytes should accept single-element array with ASCII string", + ), + StrLenBytesTest( + "expr_array_syntax_multibyte", + value=["café"], + expected=5, + msg="$strLenBytes should accept single-element array with multibyte string", + ), +] + +# Property [JSON/BSON-Meaningful Characters]: strings containing JSON/BSON structural characters are +# treated as data and each character is counted as 1 byte. 
STRLENBYTES_JSON_BSON_TESTS: list[StrLenBytesTest] = [
    # Structural delimiters, one per case; each is a single ASCII character.
    StrLenBytesTest(
        "json_bson_open_brace",
        value="{",
        expected=1,
        msg="$strLenBytes of '{' should be 1",
    ),
    StrLenBytesTest(
        "json_bson_close_brace",
        value="}",
        expected=1,
        msg="$strLenBytes of '}' should be 1",
    ),
    StrLenBytesTest(
        "json_bson_open_bracket",
        value="[",
        expected=1,
        msg="$strLenBytes of '[' should be 1",
    ),
    StrLenBytesTest(
        "json_bson_close_bracket",
        value="]",
        expected=1,
        msg="$strLenBytes of ']' should be 1",
    ),
    # Quoting and escaping characters.
    StrLenBytesTest(
        "json_bson_double_quote",
        value='"',
        expected=1,
        msg="$strLenBytes of double quote should be 1",
    ),
    StrLenBytesTest(
        "json_bson_backslash",
        value="\\",
        expected=1,
        msg="$strLenBytes of backslash should be 1",
    ),
    # A whole JSON-looking document passed as an ordinary string (12 ASCII characters).
    StrLenBytesTest(
        "json_bson_mixed",
        value='{"key": [1]}',
        expected=12,
        msg="$strLenBytes of JSON-like string should count each ASCII char as 1 byte",
    ),
]

# Property [Dollar Sign Handling]: $literal prevents dollar-prefixed strings from being interpreted
# as field paths.
STRLENBYTES_DOLLAR_SIGN_TESTS: list[StrLenBytesTest] = [
    # "$hello" wrapped in $literal: six characters counted as data.
    StrLenBytesTest(
        "dollar_literal_hello",
        value={"$literal": "$hello"},
        expected=6,
        msg="$strLenBytes should count dollar-prefixed string via $literal",
    ),
    # A lone "$" wrapped in $literal.
    StrLenBytesTest(
        "dollar_literal_bare",
        value={"$literal": "$"},
        expected=1,
        msg="$strLenBytes should count bare dollar via $literal as 1 byte",
    ),
]


# Every input-form case list in this module, run by the single parametrized test below.
STRLENBYTES_INPUT_FORM_TESTS = [
    *STRLENBYTES_EXPR_TESTS,
    *STRLENBYTES_ARRAY_SYNTAX_TESTS,
    *STRLENBYTES_JSON_BSON_TESTS,
    *STRLENBYTES_DOLLAR_SIGN_TESTS,
]


@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_INPUT_FORM_TESTS))
def test_strlenbytes_cases(collection, test_case: StrLenBytesTest):
    """Evaluate one $strLenBytes input-form case and compare it with the case's expectation."""
    expression = _expr(test_case)
    outcome = execute_expression(collection, expression)
    assertResult(
        outcome,
        expected=test_case.expected,
        error_code=test_case.error_code,
        msg=test_case.msg,
    )


# Property [Document Field References]: $strLenBytes works with values from document fields.
def test_strlenbytes_document_fields(collection):
    """Check that $strLenBytes measures a string read from an inserted document's field."""
    document = {"s": "café"}
    projection = {"result": {"$strLenBytes": "$s"}}
    outcome = execute_project_with_insert(collection, document, projection)
    # "café" is three ASCII characters plus a two-byte "é".
    assertSuccess(
        outcome, [{"result": 5}], msg="$strLenBytes should read value from document field"
    )


# Property [Nested Field Paths]: $strLenBytes resolves dotted field paths in nested documents.
+def test_strlenbytes_nested_field_paths(collection): + """Test $strLenBytes reads values from nested document field paths.""" + result = execute_expression_with_insert( + collection, {"$strLenBytes": "$a.b"}, {"a": {"b": "café"}} + ) + assertSuccess(result, [{"result": 5}], msg="$strLenBytes should resolve nested field path") diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py new file mode 100644 index 00000000..d9d19be1 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + EXPRESSION_TYPE_MISMATCH_ERROR, + FAILED_TO_PARSE_ERROR, + INVALID_DOLLAR_FIELD_PATH, + STRLENBYTES_TYPE_ERROR, +) +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import MISSING +from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + StrLenBytesTest, + _expr, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Null and Missing Errors]: null or missing arguments produce an error. 
STRLENBYTES_NULL_ERROR_TESTS: list[StrLenBytesTest] = [
    # Null supplied directly as the argument.
    StrLenBytesTest(
        "null_literal",
        value=None,
        error_code=STRLENBYTES_TYPE_ERROR,
        msg="$strLenBytes should reject null input",
    ),
    # The MISSING constant from the framework's test constants.
    StrLenBytesTest(
        "missing_field",
        value=MISSING,
        error_code=STRLENBYTES_TYPE_ERROR,
        msg="$strLenBytes should reject missing field",
    ),
    # Null produced by evaluating an expression.
    StrLenBytesTest(
        "null_expr",
        value={"$literal": None},
        error_code=STRLENBYTES_TYPE_ERROR,
        msg="$strLenBytes should reject null from expression",
    ),
    # Null wrapped in the single-element array form.
    StrLenBytesTest(
        "null_array",
        value=[None],
        error_code=STRLENBYTES_TYPE_ERROR,
        msg="$strLenBytes should reject single-element array containing null",
    ),
]

# Property [Arity Errors]: literal arrays with zero or multiple elements produce an arity error.
STRLENBYTES_ARITY_ERROR_TESTS: list[StrLenBytesTest] = [
    # Zero arguments.
    StrLenBytesTest(
        "arity_empty_array",
        value=[],
        error_code=EXPRESSION_TYPE_MISMATCH_ERROR,
        msg="$strLenBytes should reject empty array",
    ),
    # Two arguments.
    StrLenBytesTest(
        "arity_two_elements",
        value=["a", "b"],
        error_code=EXPRESSION_TYPE_MISMATCH_ERROR,
        msg="$strLenBytes should reject two-element array",
    ),
    # Three arguments.
    StrLenBytesTest(
        "arity_three_elements",
        value=["a", "b", "c"],
        error_code=EXPRESSION_TYPE_MISMATCH_ERROR,
        msg="$strLenBytes should reject three-element array",
    ),
]

# Property [Dollar Sign Error]: a bare "$" is interpreted as a field path and "$$" is interpreted
# as an empty variable name.
+STRLENBYTES_DOLLAR_ERROR_TESTS: list[StrLenBytesTest] = [ + StrLenBytesTest( + "dollar_bare_field_path", + value="$", + error_code=INVALID_DOLLAR_FIELD_PATH, + msg="$strLenBytes should reject bare '$' as invalid field path", + ), + StrLenBytesTest( + "dollar_double", + value="$$", + error_code=FAILED_TO_PARSE_ERROR, + msg="$strLenBytes should reject '$$' as empty variable name", + ), +] + + +STRLENBYTES_NON_TYPE_ERROR_TESTS = ( + STRLENBYTES_NULL_ERROR_TESTS + STRLENBYTES_ARITY_ERROR_TESTS + STRLENBYTES_DOLLAR_ERROR_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_NON_TYPE_ERROR_TESTS)) +def test_strlenbytes_cases(collection, test_case: StrLenBytesTest): + """Test $strLenBytes error cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py new file mode 100644 index 00000000..158dbba9 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertSuccess +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + StrLenBytesTest, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_project, +) + +# Property [Return Type]: the result is always an integer when the expression succeeds. 
STRLENBYTES_RETURN_TYPE_TESTS: list[StrLenBytesTest] = [
    # No `expected` is set on these cases: the test below asserts the $type of the result.
    StrLenBytesTest(
        "return_type_ascii",
        value="hello",
        msg="$strLenBytes of ASCII string should return int",
    ),
    StrLenBytesTest(
        "return_type_empty",
        value="",
        msg="$strLenBytes of empty string should return int",
    ),
    StrLenBytesTest(
        "return_type_multibyte",
        value="café",
        msg="$strLenBytes of multibyte string should return int",
    ),
    StrLenBytesTest(
        "return_type_emoji",
        value="🎉",
        msg="$strLenBytes of emoji should return int",
    ),
    StrLenBytesTest(
        "return_type_expression",
        value={"$concat": ["a", "b"]},
        msg="$strLenBytes of expression result should return int",
    ),
]


@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_RETURN_TYPE_TESTS))
def test_strlenbytes_return_type(collection, test_case: StrLenBytesTest):
    """Wrap $strLenBytes in $type and check that the reported type is "int"."""
    length_expr = {"$strLenBytes": test_case.value}
    outcome = execute_expression(collection, {"$type": length_expr})
    assertSuccess(outcome, [{"result": "int"}], msg=test_case.msg)


# Property [Byte Count Invariant]: the byte count is always >= the code point count.
STRLENBYTES_INVARIANT_TESTS: list[StrLenBytesTest] = [
    # Inputs only; the comparison is built by the test below.
    StrLenBytesTest(
        "invariant_ascii",
        value="hello",
        msg="$strLenBytes should be >= $strLenCP for ASCII",
    ),
    StrLenBytesTest(
        "invariant_2byte",
        value="café",
        msg="$strLenBytes should be >= $strLenCP for 2-byte chars",
    ),
    StrLenBytesTest(
        "invariant_3byte",
        value="寿司",
        msg="$strLenBytes should be >= $strLenCP for 3-byte chars",
    ),
    StrLenBytesTest(
        "invariant_4byte",
        value="😀🎉",
        msg="$strLenBytes should be >= $strLenCP for 4-byte chars",
    ),
    StrLenBytesTest(
        "invariant_mixed",
        value="aé€😀",
        msg="$strLenBytes should be >= $strLenCP for mixed byte widths",
    ),
    StrLenBytesTest(
        "invariant_empty",
        value="",
        msg="$strLenBytes should be >= $strLenCP for empty string",
    ),
]


@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_INVARIANT_TESTS))
def test_strlenbytes_invariant(collection, test_case: StrLenBytesTest):
    """Project `$strLenBytes >= $strLenCP` for the case's string and expect it to be true."""
    text = test_case.value
    byte_len = {"$strLenBytes": text}
    code_point_len = {"$strLenCP": text}
    outcome = execute_project(
        collection,
        {"bytesGeCp": {"$gte": [byte_len, code_point_len]}},
    )
    assertSuccess(outcome, [{"bytesGeCp": True}], msg=test_case.msg)


# Property [Length Additivity]: byte length of a concatenation equals the sum of its parts.
+STRLENBYTES_ADDITIVITY_TESTS: list[StrLenBytesTest] = [
+    StrLenBytesTest(
+        "additivity_ascii",
+        value=["hello", "world"],
+        msg="$strLenBytes of concat should equal sum of parts for ASCII",
+    ),
+    StrLenBytesTest(
+        "additivity_2byte",
+        value=["café", "naïve"],
+        msg="$strLenBytes of concat should equal sum of parts for 2-byte chars",
+    ),
+    StrLenBytesTest(
+        "additivity_mixed",
+        value=["寿司", "🎉"],
+        msg="$strLenBytes of concat should equal sum of parts for mixed byte widths",
+    ),
+    StrLenBytesTest(
+        "additivity_empty_left",
+        value=["", "hello"],
+        msg="$strLenBytes of concat should equal sum of parts with empty left",
+    ),
+    StrLenBytesTest(
+        "additivity_empty_right",
+        value=["hello", ""],
+        msg="$strLenBytes of concat should equal sum of parts with empty right",
+    ),
+]
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_ADDITIVITY_TESTS))
+def test_strlenbytes_additivity(collection, test_case: StrLenBytesTest):
+    """Check $strLenBytes of a $concat equals the summed $strLenBytes of its parts."""
+    segments = test_case.value
+    # Reference total computed client-side from each part's UTF-8 encoding.
+    total_bytes = sum(len(segment.encode("utf-8")) for segment in segments)
+    # lenConcat measures the joined string; sumParts adds the per-part lengths.
+    projection = {
+        "lenConcat": {"$strLenBytes": {"$concat": segments}},
+        "sumParts": {"$add": [{"$strLenBytes": segment} for segment in segments]},
+    }
+    outcome = execute_project(collection, projection)
+    # Both projected fields must agree with the reference total.
+    wanted = [{"lenConcat": total_bytes, "sumParts": total_bytes}]
+    assertSuccess(outcome, wanted, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py
new file mode 100644
index 00000000..7cacc2bf
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import (
+    StrLenBytesTest,
+    _expr,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression
+
+# Property [String Size Limit - Success]: inputs just under the size limit succeed.
+STRLENBYTES_SIZE_LIMIT_SUCCESS_TESTS: list[StrLenBytesTest] = [
+    StrLenBytesTest(
+        "size_one_under",
+        value="a" * (STRING_SIZE_LIMIT_BYTES - 1),
+        expected=STRING_SIZE_LIMIT_BYTES - 1,
+        msg="$strLenBytes should handle a string one byte under the size limit",
+    ),
+    StrLenBytesTest(
+        "size_one_under_3byte",
+        value="寿" * ((STRING_SIZE_LIMIT_BYTES - 1) // 3),
+        expected=(STRING_SIZE_LIMIT_BYTES - 1) // 3 * 3,
+        msg="$strLenBytes should handle 3-byte chars near the size limit",
+    ),
+]
+
+# Property [String Size Limit - Error]: inputs at or above the size limit produce an error.
+STRLENBYTES_SIZE_LIMIT_ERROR_TESTS: list[StrLenBytesTest] = [
+    StrLenBytesTest(
+        "size_at_limit",
+        value="a" * STRING_SIZE_LIMIT_BYTES,
+        error_code=STRING_SIZE_LIMIT_ERROR,
+        msg="$strLenBytes should reject a string at the size limit",
+    ),
+]
+
+
+STRLENBYTES_SIZE_LIMIT_TESTS = (
+    STRLENBYTES_SIZE_LIMIT_SUCCESS_TESTS + STRLENBYTES_SIZE_LIMIT_ERROR_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_SIZE_LIMIT_TESTS))
+def test_strlenbytes_cases(collection, test_case: StrLenBytesTest):
+    """Evaluate one $strLenBytes size-limit case and check its expected value or error."""
+    expression = _expr(test_case)
+    outcome = execute_expression(collection, expression)
+    want, code = test_case.expected, test_case.error_code
+    assertResult(outcome, expected=want, error_code=code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py
new file mode 100644
index 00000000..0b69ee34
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py
@@ -0,0 +1,206 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import pytest
+from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+from bson.code import Code
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import STRLENBYTES_TYPE_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_INFINITY,
+    DECIMAL128_NAN,
+    DECIMAL128_NEGATIVE_INFINITY,
+    DECIMAL128_ONE_AND_HALF,
+    FLOAT_INFINITY,
+    FLOAT_NAN,
+    FLOAT_NEGATIVE_INFINITY,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import (
+    StrLenBytesTest,
+    _expr,
+)
+from
documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + +# Property [Type Strictness]: any non-string argument produces an error. +STRLENBYTES_TYPE_ERROR_TESTS: list[StrLenBytesTest] = [ + StrLenBytesTest( + "type_int", + value=42, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject int", + ), + StrLenBytesTest( + "type_long", + value=Int64(42), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Int64", + ), + StrLenBytesTest( + "type_double", + value=3.14, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject double", + ), + StrLenBytesTest( + "type_decimal", + value=DECIMAL128_ONE_AND_HALF, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Decimal128", + ), + StrLenBytesTest( + "type_bool", + value=True, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject boolean", + ), + StrLenBytesTest( + "type_date", + value=datetime(2024, 1, 1, tzinfo=timezone.utc), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject datetime", + ), + StrLenBytesTest( + "type_regex", + value=Regex("abc"), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject regex", + ), + StrLenBytesTest( + "type_objectid", + value=ObjectId("507f1f77bcf86cd799439011"), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject ObjectId", + ), + StrLenBytesTest( + "type_object", + value={"a": 1}, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject object", + ), + StrLenBytesTest( + "type_binary", + value=Binary(b"data"), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject binary", + ), + StrLenBytesTest( + "type_maxkey", + value=MaxKey(), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject MaxKey", + ), + StrLenBytesTest( + "type_minkey", + value=MinKey(), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject MinKey", + ), + StrLenBytesTest( + 
"type_timestamp", + value=Timestamp(1, 1), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Timestamp", + ), + StrLenBytesTest( + "type_code", + value=Code("function() {}"), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Code", + ), + StrLenBytesTest( + "type_code_with_scope", + value=Code("function() {}", {"x": 1}), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Code with scope", + ), + StrLenBytesTest( + "type_binary_uuid", + value=Binary(b"data", 4), + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject binary UUID", + ), + # Special float values. + StrLenBytesTest( + "type_nan", + value=FLOAT_NAN, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject NaN", + ), + StrLenBytesTest( + "type_inf", + value=FLOAT_INFINITY, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Infinity", + ), + StrLenBytesTest( + "type_neg_inf", + value=FLOAT_NEGATIVE_INFINITY, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject -Infinity", + ), + StrLenBytesTest( + "type_decimal_nan", + value=DECIMAL128_NAN, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Decimal128 NaN", + ), + StrLenBytesTest( + "type_decimal_inf", + value=DECIMAL128_INFINITY, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Decimal128 Infinity", + ), + StrLenBytesTest( + "type_decimal_neg_inf", + value=DECIMAL128_NEGATIVE_INFINITY, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject Decimal128 -Infinity", + ), + # Expression returning non-string type. + StrLenBytesTest( + "type_expr_returns_int", + value={"$add": [1, 2]}, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject expression resolving to int", + ), + # An expression returning an array fails the type check, unlike a literal array which is + # parsed as an argument list. 
+ StrLenBytesTest( + "type_expr_returns_array", + value={"$split": ["hello", "-"]}, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject expression resolving to array", + ), + # Runtime array via $literal produces type error, not arity error. + StrLenBytesTest( + "type_runtime_array", + value={"$literal": ["hello"]}, + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject runtime array via $literal", + ), + # Array syntax with invalid types: single-element array is unwrapped, then type-checked. + StrLenBytesTest( + "type_array_int", + value=[42], + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject single-element array containing int", + ), + StrLenBytesTest( + "type_array_bool", + value=[True], + error_code=STRLENBYTES_TYPE_ERROR, + msg="$strLenBytes should reject single-element array containing boolean", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(STRLENBYTES_TYPE_ERROR_TESTS)) +def test_strlenbytes_cases(collection, test_case: StrLenBytesTest): + """Test $strLenBytes type error cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/strLenBytes_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/strLenBytes_common.py new file mode 100644 index 00000000..433cda22 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/utils/strLenBytes_common.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from dataclasses import 
dataclass
+from typing import Any, cast
+
+from documentdb_tests.framework.test_case import BaseTestCase
+
+
+@dataclass(frozen=True)
+class StrLenBytesTest(BaseTestCase):
+    """One parametrized $strLenBytes case: an operand, or a raw expression override."""
+
+    value: Any = None  # operand that _expr places under "$strLenBytes"
+    expr: Any = None  # when set, _expr returns this instead of wrapping value
+
+
+def _expr(test_case: StrLenBytesTest) -> dict[str, Any]:
+    """Return the case's raw ``expr`` override if set, else ``{"$strLenBytes": value}``."""
+    raw = test_case.expr
+    return {"$strLenBytes": test_case.value} if raw is None else cast(dict[str, Any], raw)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py
new file mode 100644
index 00000000..acf0d56c
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py
@@ -0,0 +1,299 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import (
+    StrLenCPTest,
+    _expr,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import (
+    execute_expression,
+)
+
+# Property [Core Behavior]: returns the number of UTF-8 code points in the input string.
+STRLENCP_CORE_TESTS: list[StrLenCPTest] = [ + StrLenCPTest("core_empty", value="", expected=0, msg="$strLenCP of empty string should be 0"), + StrLenCPTest("core_space", value=" ", expected=1, msg="$strLenCP of space should be 1"), + StrLenCPTest( + "core_multiple_spaces", value=" ", expected=3, msg="$strLenCP of three spaces should be 3" + ), + StrLenCPTest( + "core_ascii_word", + value="hello", + expected=5, + msg="$strLenCP of ASCII word should equal character count", + ), + StrLenCPTest("core_newline", value="\n", expected=1, msg="$strLenCP of newline should be 1"), + StrLenCPTest("core_tab", value="\t", expected=1, msg="$strLenCP of tab should be 1"), + StrLenCPTest("core_cr", value="\r", expected=1, msg="$strLenCP of carriage return should be 1"), + StrLenCPTest( + "core_null_byte", value="\x00", expected=1, msg="$strLenCP of null byte should be 1" + ), + StrLenCPTest("core_crlf", value="\r\n", expected=2, msg="$strLenCP of CRLF should be 2"), + # 2-byte UTF-8: Latin extended, Greek. + StrLenCPTest( + "core_2byte_e_acute", + value="é", + expected=1, + msg="$strLenCP of 2-byte é should be 1 code point", + ), + StrLenCPTest( + "core_2byte_n_tilde", + value="ñ", + expected=1, + msg="$strLenCP of 2-byte ñ should be 1 code point", + ), + StrLenCPTest( + "core_2byte_greek", + value="λ", + expected=1, + msg="$strLenCP of 2-byte Greek lambda should be 1 code point", + ), + # 3-byte UTF-8: CJK, Euro sign, BOM, ZWJ. 
+ StrLenCPTest( + "core_3byte_cjk", + value="寿", + expected=1, + msg="$strLenCP of 3-byte CJK character should be 1 code point", + ), + StrLenCPTest( + "core_3byte_euro", + value="€", + expected=1, + msg="$strLenCP of 3-byte Euro sign should be 1 code point", + ), + StrLenCPTest( + "core_3byte_bom", + value="\ufeff", + expected=1, + msg="$strLenCP of 3-byte BOM should be 1 code point", + ), + StrLenCPTest( + "core_3byte_zwj", + value="\u200d", + expected=1, + msg="$strLenCP of 3-byte ZWJ should be 1 code point", + ), + StrLenCPTest( + "core_3byte_zwsp", + value="\u200b", + expected=1, + msg="$strLenCP of 3-byte zero-width space should be 1 code point", + ), + # 4-byte UTF-8: emoji, math symbols. + StrLenCPTest( + "core_4byte_emoji", + value="😀", + expected=1, + msg="$strLenCP of 4-byte emoji should be 1 code point", + ), + StrLenCPTest( + "core_4byte_math", + value="𝜋", + expected=1, + msg="$strLenCP of 4-byte math symbol should be 1 code point", + ), + StrLenCPTest( + "core_4byte_party", + value="🎉", + expected=1, + msg="$strLenCP of 4-byte party emoji should be 1 code point", + ), + # Mixed byte widths. + StrLenCPTest( + "core_mixed_all_widths", + value="aé€😀", + expected=4, + msg="$strLenCP should count each character as 1 code point regardless of byte width", + ), + StrLenCPTest( + "core_mixed_latin", + value="cafétéria", + expected=9, + msg="$strLenCP should count mixed ASCII and accented chars as individual code points", + ), + StrLenCPTest( + "core_mixed_spanish", + value="jalapeño", + expected=8, + msg="$strLenCP should count ñ as 1 code point", + ), + # Precomposed U+00E9 (1 code point) vs decomposed U+0065 + U+0301 (2 code points). 
+ StrLenCPTest( + "core_precomposed", + value="\u00e9", + expected=1, + msg="$strLenCP of precomposed é should be 1 code point", + ), + StrLenCPTest( + "core_decomposed", + value="e\u0301", + expected=2, + msg="$strLenCP of decomposed é (e + combining accent) should be 2 code points", + ), + # ZWJ emoji sequence: 3 emoji + 2 ZWJ (U+200D) = 5 code points. + StrLenCPTest( + "core_zwj_emoji", + value="👨\u200d👩\u200d👧", + expected=5, + msg="$strLenCP of ZWJ emoji sequence should count each emoji and joiner separately", + ), + # Mixed scripts. + StrLenCPTest( + "core_accent_word", + value="café", + expected=4, + msg="$strLenCP of 'café' should be 4 code points", + ), + StrLenCPTest( + "core_cjk_word", value="寿司", expected=2, msg="$strLenCP of two CJK characters should be 2" + ), + StrLenCPTest( + "core_mixed_scripts", + value="hello 世界", + expected=8, + msg="$strLenCP should count ASCII and CJK as 1 code point each", + ), +] + + +# Property [Encoding and Character Handling]: whitespace variants, control characters, directional +# markers, encoding boundary characters, and locale-sensitive letters are each counted as exactly 1 +# code point. +STRLENCP_ENCODING_TESTS: list[StrLenCPTest] = [ + # Whitespace variants. + StrLenCPTest( + "encoding_nbsp", + value="\u00a0", + expected=1, + msg="$strLenCP of non-breaking space should be 1", + ), + StrLenCPTest( + "encoding_en_space", value="\u2000", expected=1, msg="$strLenCP of en space should be 1" + ), + StrLenCPTest( + "encoding_em_space", value="\u2003", expected=1, msg="$strLenCP of em space should be 1" + ), + # Control characters. + StrLenCPTest( + "encoding_control_soh", + value="\x01", + expected=1, + msg="$strLenCP of SOH control character should be 1", + ), + StrLenCPTest( + "encoding_control_us", + value="\x1f", + expected=1, + msg="$strLenCP of US control character should be 1", + ), + # Directional markers. 
+ StrLenCPTest( + "encoding_ltr_mark", + value="\u200e", + expected=1, + msg="$strLenCP of left-to-right mark should be 1", + ), + StrLenCPTest( + "encoding_rtl_mark", + value="\u200f", + expected=1, + msg="$strLenCP of right-to-left mark should be 1", + ), + # UTF-8 encoding boundary characters. + StrLenCPTest( + "encoding_boundary_d7ff", + value="\ud7ff", + expected=1, + msg="$strLenCP of U+D7FF (last pre-surrogate) should be 1", + ), + StrLenCPTest( + "encoding_boundary_e000", + value="\ue000", + expected=1, + msg="$strLenCP of U+E000 (first PUA) should be 1", + ), + StrLenCPTest( + "encoding_boundary_ffff", + value="\uffff", + expected=1, + msg="$strLenCP of U+FFFF (last BMP) should be 1", + ), + StrLenCPTest( + "encoding_boundary_10000", + value="\U00010000", + expected=1, + msg="$strLenCP of U+10000 (first supplementary) should be 1", + ), + StrLenCPTest( + "encoding_boundary_10ffff", + value="\U0010ffff", + expected=1, + msg="$strLenCP of U+10FFFF (last valid codepoint) should be 1", + ), + # Deseret script. + StrLenCPTest( + "encoding_deseret", + value="\U00010400", + expected=1, + msg="$strLenCP of Deseret character should be 1", + ), + # Locale-sensitive letters. + StrLenCPTest( + "encoding_sharp_s", value="ß", expected=1, msg="$strLenCP of German sharp s should be 1" + ), + StrLenCPTest( + "encoding_fi_ligature", + value="\ufb01", + expected=1, + msg="$strLenCP of fi ligature should be 1", + ), + StrLenCPTest( + "encoding_dotless_i", + value="\u0131", + expected=1, + msg="$strLenCP of Turkish dotless i should be 1", + ), +] + +# Property [Embedded Null Bytes]: null bytes in various positions do not cause early string +# termination. 
+STRLENCP_NULL_BYTE_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "null_byte_at_start",
+        value="\x00xyz",
+        expected=4,
+        msg="$strLenCP should not terminate early on leading null byte",
+    ),
+    StrLenCPTest(
+        "null_byte_at_end",
+        value="xyz\x00",
+        expected=4,
+        msg="$strLenCP should not terminate early on trailing null byte",
+    ),
+    StrLenCPTest(
+        "null_byte_multiple",
+        value="a\x00b\x00c\x00",
+        expected=6,
+        msg="$strLenCP should count all code points with interleaved null bytes",
+    ),
+    StrLenCPTest(
+        "null_byte_with_multibyte",
+        value="寿\x00司",
+        expected=3,
+        msg="$strLenCP should count all code points with null byte between multibyte chars",
+    ),
+]
+
+STRLENCP_CODE_POINT_TESTS = STRLENCP_CORE_TESTS + STRLENCP_ENCODING_TESTS + STRLENCP_NULL_BYTE_TESTS
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_CODE_POINT_TESTS))
+def test_strlencp_cases(collection, test_case: StrLenCPTest):
+    """Evaluate one $strLenCP code-point case and check its expected value or error."""
+    expression = _expr(test_case)
+    outcome = execute_expression(collection, expression)
+    want, code = test_case.expected, test_case.error_code
+    assertResult(outcome, expected=want, error_code=code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py
new file mode 100644
index 00000000..f33ecf2a
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult, assertSuccess
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import (
+    StrLenCPTest,
+    _expr,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import (
+    execute_expression,
+    execute_expression_with_insert,
+    execute_project_with_insert,
+)
+
+# Property [Expression Arguments]: the argument accepts any expression that resolves to a string.
+STRLENCP_EXPR_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "expr_concat",
+        value={"$concat": ["hel", "lo"]},
+        expected=5,
+        msg="$strLenCP should accept $concat expression as argument",
+    ),
+    StrLenCPTest(
+        "expr_toupper",
+        value={"$toUpper": "hello"},
+        expected=5,
+        msg="$strLenCP should accept $toUpper expression as argument",
+    ),
+]
+
+# Property [Array Syntax]: a literal single-element array is parsed as one argument.
+STRLENCP_ARRAY_SYNTAX_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "expr_array_syntax_ascii",
+        value=["hello"],
+        expected=5,
+        msg="$strLenCP should accept single-element array with ASCII string",
+    ),
+    StrLenCPTest(
+        "expr_array_syntax_multibyte",
+        value=["café"],
+        expected=4,
+        msg="$strLenCP should accept single-element array with multibyte string",
+    ),
+]
+
+
+# Property [JSON/BSON-Meaningful Characters]: strings containing JSON/BSON-meaningful characters are
+# treated as data and each character is counted as 1 code point.
+STRLENCP_JSON_BSON_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "json_bson_double_quote", value='"', expected=1, msg="$strLenCP of double quote should be 1"
+    ),
+    StrLenCPTest(
+        "json_bson_backslash", value="\\", expected=1, msg="$strLenCP of backslash should be 1"
+    ),
+    StrLenCPTest("json_bson_open_brace", value="{", expected=1, msg="$strLenCP of '{' should be 1"),
+    StrLenCPTest(
+        "json_bson_close_brace", value="}", expected=1, msg="$strLenCP of '}' should be 1"
+    ),
+    StrLenCPTest(
+        "json_bson_open_bracket", value="[", expected=1, msg="$strLenCP of '[' should be 1"
+    ),
+    StrLenCPTest(
+        "json_bson_close_bracket", value="]", expected=1, msg="$strLenCP of ']' should be 1"
+    ),
+    StrLenCPTest("json_bson_colon", value=":", expected=1, msg="$strLenCP of colon should be 1"),
+    StrLenCPTest("json_bson_comma", value=",", expected=1, msg="$strLenCP of comma should be 1"),
+    StrLenCPTest(
+        "json_bson_mixed",
+        value='{"key": [1, 2]}',
+        expected=15,
+        msg="$strLenCP of JSON-like string should count each character as 1",
+    ),
+]
+
+
+# Property [Dollar Sign Literal]: $literal avoids field path interpretation for dollar-prefixed
+# strings.
+STRLENCP_DOLLAR_LITERAL_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "dollar_literal_hello",
+        value={"$literal": "$hello"},
+        expected=6,
+        msg="$strLenCP should count dollar-prefixed string via $literal",
+    ),
+    StrLenCPTest(
+        "dollar_literal_bare",
+        value={"$literal": "$"},
+        expected=1,
+        msg="$strLenCP should count bare dollar via $literal as 1",
+    ),
+]
+
+STRLENCP_INPUT_FORM_TESTS = (
+    STRLENCP_EXPR_TESTS
+    + STRLENCP_ARRAY_SYNTAX_TESTS
+    + STRLENCP_JSON_BSON_TESTS
+    + STRLENCP_DOLLAR_LITERAL_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_INPUT_FORM_TESTS))
+def test_strlencp_cases(collection, test_case: StrLenCPTest):
+    """Evaluate one $strLenCP input-form case and check its expected value or error."""
+    expression = _expr(test_case)
+    outcome = execute_expression(collection, expression)
+    want, code = test_case.expected, test_case.error_code
+    assertResult(outcome, expected=want, error_code=code, msg=test_case.msg)
+
+
+# Property [Document Field References]: $strLenCP works with values from document fields.
+def test_strlencp_document_fields(collection):
+    """Check $strLenCP counts the code points of a string read from a document field."""
+    document = {"s": "café"}
+    projection = {"result": {"$strLenCP": "$s"}}
+    outcome = execute_project_with_insert(collection, document, projection)
+    # The stored string "café" is expected to yield four code points.
+    wanted = [{"result": 4}]
+    assertSuccess(outcome, wanted, msg="$strLenCP should read value from document field")
+
+
+# Property [Nested Field Paths]: $strLenCP resolves dotted field paths in nested documents.
+def test_strlencp_nested_field_paths(collection):
+    """Check $strLenCP resolves the dotted field path "$a.b" inside a nested document."""
+    got = execute_expression_with_insert(collection, {"$strLenCP": "$a.b"}, {"a": {"b": "café"}})
+    assertSuccess(got, [{"result": 4}], msg="$strLenCP should resolve nested field path")
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py
new file mode 100644
index 00000000..194561d2
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    EXPRESSION_TYPE_MISMATCH_ERROR,
+    FAILED_TO_PARSE_ERROR,
+    INVALID_DOLLAR_FIELD_PATH,
+    STRLENCP_TYPE_ERROR,
+)
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import MISSING
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import (
+    StrLenCPTest,
+    _expr,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import (
+    execute_expression,
+)
+
+# Property [Null and Missing Errors]: null or missing arguments produce an error.
+STRLENCP_NULL_ERROR_TESTS: list[StrLenCPTest] = [ + StrLenCPTest( + "null_literal", + value=None, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject null input", + ), + StrLenCPTest( + "missing_field", + value=MISSING, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject missing field", + ), + StrLenCPTest( + "null_expr", + value={"$literal": None}, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject null from expression", + ), + StrLenCPTest( + "null_array", + value=[None], + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject single-element array containing null", + ), +] + +# Property [Arity Errors]: literal arrays with zero or multiple elements produce an arity error. +STRLENCP_ARITY_ERROR_TESTS: list[StrLenCPTest] = [ + StrLenCPTest( + "arity_empty_array", + value=[], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$strLenCP should reject empty array", + ), + StrLenCPTest( + "arity_two_elements", + value=["a", "b"], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$strLenCP should reject two-element array", + ), + StrLenCPTest( + "arity_three_elements", + value=["a", "b", "c"], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$strLenCP should reject three-element array", + ), +] + + +# Property [Dollar Sign Error]: a bare "$" is interpreted as a field path and "$$" is interpreted +# as an empty variable name. 
+STRLENCP_DOLLAR_ERROR_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "dollar_bare",
+        value="$",
+        error_code=INVALID_DOLLAR_FIELD_PATH,
+        msg="$strLenCP should reject bare '$' as invalid field path",
+    ),
+    StrLenCPTest(
+        "dollar_double",
+        value="$$",
+        error_code=FAILED_TO_PARSE_ERROR,
+        msg="$strLenCP should reject '$$' as empty variable name",
+    ),
+]
+
+STRLENCP_INVALID_ARG_TESTS = (
+    STRLENCP_NULL_ERROR_TESTS + STRLENCP_ARITY_ERROR_TESTS + STRLENCP_DOLLAR_ERROR_TESTS
+)
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_INVALID_ARG_TESTS))
+def test_strlencp_cases(collection, test_case: StrLenCPTest):
+    """Evaluate one $strLenCP invalid-argument case and check its expected value or error."""
+    expression = _expr(test_case)
+    outcome = execute_expression(collection, expression)
+    want, code = test_case.expected, test_case.error_code
+    assertResult(outcome, expected=want, error_code=code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py
new file mode 100644
index 00000000..40a56168
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertSuccess
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import (
+    StrLenCPTest,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import (
+    execute_expression,
+    execute_project,
+)
+
+# Property [Return Type]: the result is always an integer when the expression succeeds.
+STRLENCP_RETURN_TYPE_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "return_type_ascii", value="hello", msg="$strLenCP of ASCII string should return int"
+    ),
+    StrLenCPTest("return_type_empty", value="", msg="$strLenCP of empty string should return int"),
+    StrLenCPTest(
+        "return_type_multibyte", value="café", msg="$strLenCP of multibyte string should return int"
+    ),
+    StrLenCPTest("return_type_emoji", value="🎉", msg="$strLenCP of emoji should return int"),
+    StrLenCPTest(
+        "return_type_expression",
+        value={"$concat": ["a", "b"]},
+        msg="$strLenCP of expression result should return int",
+    ),
+]
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_RETURN_TYPE_TESTS))
+def test_strlencp_return_type(collection, test_case: StrLenCPTest):
+    """Check that $type applied to a $strLenCP result reports "int"."""
+    probe = {"$type": {"$strLenCP": test_case.value}}
+    assertSuccess(execute_expression(collection, probe), [{"result": "int"}], msg=test_case.msg)
+
+
+# Property [Byte Count Invariant]: the code point count is always <= the byte count.
+STRLENCP_INVARIANT_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "invariant_ascii", value="hello", msg="$strLenCP should be <= $strLenBytes for ASCII"
+    ),
+    StrLenCPTest(
+        "invariant_2byte", value="café", msg="$strLenCP should be <= $strLenBytes for 2-byte chars"
+    ),
+    StrLenCPTest(
+        "invariant_3byte", value="寿司", msg="$strLenCP should be <= $strLenBytes for 3-byte chars"
+    ),
+    StrLenCPTest(
+        "invariant_4byte", value="😀🎉", msg="$strLenCP should be <= $strLenBytes for 4-byte chars"
+    ),
+    StrLenCPTest(
+        "invariant_mixed",
+        value="aé€😀",
+        msg="$strLenCP should be <= $strLenBytes for mixed byte widths",
+    ),
+    StrLenCPTest(
+        "invariant_empty", value="", msg="$strLenCP should be <= $strLenBytes for empty string"
+    ),
+]
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_INVARIANT_TESTS))
+def test_strlencp_invariant(collection, test_case: StrLenCPTest):
+    """Check the code point count of a string never exceeds its byte count."""
+    text = test_case.value
+    code_points = {"$strLenCP": text}
+    byte_count = {"$strLenBytes": text}
+    # The comparison is expressed with $lte inside the projection itself.
+    outcome = execute_project(collection, {"cpLteBytes": {"$lte": [code_points, byte_count]}})
+    assertSuccess(outcome, [{"cpLteBytes": True}], msg=test_case.msg)
+
+
+# Property [Length Additivity]: code point length of a concatenation equals the sum of its parts.
+STRLENCP_ADDITIVITY_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "additivity_ascii",
+        value=["hello", "world"],
+        msg="$strLenCP of concat should equal sum of parts for ASCII",
+    ),
+    StrLenCPTest(
+        "additivity_2byte",
+        value=["café", "naïve"],
+        msg="$strLenCP of concat should equal sum of parts for 2-byte chars",
+    ),
+    StrLenCPTest(
+        "additivity_mixed",
+        value=["寿司", "🎉"],
+        msg="$strLenCP of concat should equal sum of parts for mixed byte widths",
+    ),
+    StrLenCPTest(
+        "additivity_empty_left",
+        value=["", "hello"],
+        msg="$strLenCP of concat should equal sum of parts with empty left",
+    ),
+    StrLenCPTest(
+        "additivity_empty_right",
+        value=["hello", ""],
+        msg="$strLenCP of concat should equal sum of parts with empty right",
+    ),
+]
+
+
+@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_ADDITIVITY_TESTS))
+def test_strlencp_additivity(collection, test_case: StrLenCPTest):
+    """Check $strLenCP of a $concat equals the summed $strLenCP of its parts."""
+    segments = test_case.value
+    # Reference total: Python's len() of each part, summed client-side.
+    total_cp = sum(map(len, segments))
+    projection = {
+        "lenConcat": {"$strLenCP": {"$concat": segments}},
+        "sumParts": {"$add": [{"$strLenCP": segment} for segment in segments]},
+    }
+    outcome = execute_project(collection, projection)
+    wanted = [{"lenConcat": total_cp, "sumParts": total_cp}]
+    assertSuccess(outcome, wanted, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py
new file mode 100644
index 00000000..3ee667d6
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR
+from documentdb_tests.framework.test_case import pytest_params
+from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES
+from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import (
+    StrLenCPTest,
+    _expr,
+)
+from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import (
+    execute_expression,
+)
+
+# Property [String Size Limit - Success]: inputs just under the size limit succeed.
+STRLENCP_SIZE_LIMIT_SUCCESS_TESTS: list[StrLenCPTest] = [
+    StrLenCPTest(
+        "size_one_under",
+        value="a" * (STRING_SIZE_LIMIT_BYTES - 1),
+        expected=STRING_SIZE_LIMIT_BYTES - 1,
+        msg="$strLenCP should handle a string one byte under the size limit",
+    ),
+    StrLenCPTest(
+        "size_one_under_3byte",
+        value="寿" * ((STRING_SIZE_LIMIT_BYTES - 1) // 3),
+        expected=(STRING_SIZE_LIMIT_BYTES - 1) // 3,
+        msg="$strLenCP should count 3-byte chars as code points near the size limit",
+    ),
+]
+
+
+# Property [String Size Limit - Error]: inputs at or above the size limit produce an error.
+STRLENCP_SIZE_LIMIT_ERROR_TESTS: list[StrLenCPTest] = [ + StrLenCPTest( + "size_at_limit", + value="a" * STRING_SIZE_LIMIT_BYTES, + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$strLenCP should reject a string at the size limit", + ), +] + +STRLENCP_SIZE_LIMIT_TESTS = STRLENCP_SIZE_LIMIT_SUCCESS_TESTS + STRLENCP_SIZE_LIMIT_ERROR_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_SIZE_LIMIT_TESTS)) +def test_strlencp_cases(collection, test_case: StrLenCPTest): + """Test $strLenCP cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py new file mode 100644 index 00000000..da48d1d5 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import STRLENCP_TYPE_ERROR +from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_NAN, + DECIMAL128_NEGATIVE_INFINITY, + DECIMAL128_ONE_AND_HALF, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( + StrLenCPTest, + _expr, +) +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) + +# Property [Type Strictness]: 
any non-string argument produces an error. +STRLENCP_TYPE_ERROR_TESTS: list[StrLenCPTest] = [ + StrLenCPTest( + "type_int", value=42, error_code=STRLENCP_TYPE_ERROR, msg="$strLenCP should reject int" + ), + StrLenCPTest( + "type_long", + value=Int64(42), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Int64", + ), + StrLenCPTest( + "type_double", + value=3.14, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject double", + ), + StrLenCPTest( + "type_decimal", + value=DECIMAL128_ONE_AND_HALF, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Decimal128", + ), + StrLenCPTest( + "type_bool", + value=True, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject boolean", + ), + StrLenCPTest( + "type_date", + value=datetime(2024, 1, 1, tzinfo=timezone.utc), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject datetime", + ), + StrLenCPTest( + "type_regex", + value=Regex("abc"), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject regex", + ), + StrLenCPTest( + "type_objectid", + value=ObjectId("507f1f77bcf86cd799439011"), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject ObjectId", + ), + StrLenCPTest( + "type_object", + value={"a": 1}, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject object", + ), + StrLenCPTest( + "type_binary", + value=Binary(b"data"), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject binary", + ), + StrLenCPTest( + "type_binary_uuid", + value=Binary(b"data", 4), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject binary UUID", + ), + StrLenCPTest( + "type_maxkey", + value=MaxKey(), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject MaxKey", + ), + StrLenCPTest( + "type_minkey", + value=MinKey(), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject MinKey", + ), + StrLenCPTest( + "type_timestamp", + value=Timestamp(1, 1), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject 
Timestamp", + ), + StrLenCPTest( + "type_code", + value=Code("function() {}"), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Code", + ), + StrLenCPTest( + "type_code_with_scope", + value=Code("function() {}", {"x": 1}), + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Code with scope", + ), + # Special float values. + StrLenCPTest( + "type_nan", + value=FLOAT_NAN, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject NaN", + ), + StrLenCPTest( + "type_inf", + value=FLOAT_INFINITY, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Infinity", + ), + StrLenCPTest( + "type_neg_inf", + value=FLOAT_NEGATIVE_INFINITY, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject -Infinity", + ), + StrLenCPTest( + "type_decimal_nan", + value=DECIMAL128_NAN, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Decimal128 NaN", + ), + StrLenCPTest( + "type_decimal_inf", + value=DECIMAL128_INFINITY, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Decimal128 Infinity", + ), + StrLenCPTest( + "type_decimal_neg_inf", + value=DECIMAL128_NEGATIVE_INFINITY, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject Decimal128 -Infinity", + ), + # Expression returning non-string type. + StrLenCPTest( + "type_expr_returns_int", + value={"$add": [1, 2]}, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject expression resolving to int", + ), + # An expression returning an array fails the type check, unlike a literal array + # which is parsed as an argument list. + StrLenCPTest( + "type_expr_returns_array", + value={"$split": ["hello", "-"]}, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject expression resolving to array", + ), + # Runtime array via $literal produces type error, not arity error. 
+ StrLenCPTest( + "type_runtime_array", + value={"$literal": ["hello"]}, + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject runtime array via $literal", + ), + # Array syntax with invalid types: single-element array is unwrapped, then type-checked. + StrLenCPTest( + "type_array_int", + value=[42], + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject single-element array containing int", + ), + StrLenCPTest( + "type_array_bool", + value=[True], + error_code=STRLENCP_TYPE_ERROR, + msg="$strLenCP should reject single-element array containing boolean", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(STRLENCP_TYPE_ERROR_TESTS)) +def test_strlencp_cases(collection, test_case: StrLenCPTest): + """Test $strLenCP cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, expected=test_case.expected, error_code=test_case.error_code, msg=test_case.msg + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/strLenCP_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/strLenCP_common.py new file mode 100644 index 00000000..26c46c71 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/utils/strLenCP_common.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, cast + +from documentdb_tests.framework.test_case import BaseTestCase + + +@dataclass(frozen=True) +class StrLenCPTest(BaseTestCase): + """Test case for $strLenCP operator.""" + + value: Any = None + expr: Any = None # Raw expression override + + +def _expr(test_case: StrLenCPTest) -> dict[str, 
Any]: + if test_case.expr is not None: + return cast(dict[str, Any], test_case.expr) + return {"$strLenCP": test_case.value} diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py new file mode 100644 index 00000000..51d55e0f --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py @@ -0,0 +1,255 @@ +""" +Integration tests for string expression operators interacting with each other. + +These tests verify that composing multiple string operators produces correct +results. Individual operator edge cases are tested in each operator's own +folder; these tests focus on cross-operator interactions where behavioral +differences between engines are most likely to surface. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +import pytest + +from documentdb_tests.framework.assertions import assertSuccess +from documentdb_tests.framework.test_case import BaseTestCase, pytest_params +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression + + +@dataclass(frozen=True) +class ExprTest(BaseTestCase): + """Test case with an inline expression.""" + + expr: Any = None + + +# Property [Concat-Split Round-Trip]: splitting a $concat-joined string recovers the original parts. 
+CONCAT_SPLIT_TESTS: list[ExprTest] = [ + ExprTest( + "concat_split_simple", + expr={"$split": [{"$concat": ["a", "-", "b", "-", "c"]}, "-"]}, + expected=["a", "b", "c"], + msg="$split of $concat-joined parts should recover original parts", + ), + ExprTest( + "concat_split_empty_parts", + expr={"$split": [{"$concat": ["", "|", "a", "|", ""]}, "|"]}, + expected=["", "a", ""], + msg="$split of $concat with empty parts should preserve empties", + ), + ExprTest( + "concat_split_multibyte_delimiter", + expr={"$split": [{"$concat": ["hello", "\u2192", "world"]}, "\u2192"]}, + expected=["hello", "world"], + msg="$split of $concat with multi-byte delimiter should round-trip", + ), +] + +# Property [Case Chaining]: applying $toLower and $toUpper in sequence produces deterministic case. +CASE_CHAIN_TESTS: list[ExprTest] = [ + ExprTest( + "lower_of_upper", + expr={"$toLower": {"$toUpper": "Hello World"}}, + expected="hello world", + msg="$toLower of $toUpper should produce lowercase", + ), + ExprTest( + "upper_of_lower", + expr={"$toUpper": {"$toLower": "Hello World"}}, + expected="HELLO WORLD", + msg="$toUpper of $toLower should produce uppercase", + ), +] + +# Property [Trim-Concat Boundary]: $concat of $trim result does not reintroduce whitespace. +TRIM_CONCAT_TESTS: list[ExprTest] = [ + ExprTest( + "trim_then_concat", + expr={ + "$concat": [ + {"$trim": {"input": " hello "}}, + " ", + {"$trim": {"input": " world "}}, + ] + }, + expected="hello world", + msg="$concat of trimmed strings should join without extra whitespace", + ), +] + +# Property [Find-Extract]: $substrCP at the position returned by $indexOfCP extracts the searched +# substring. 
+FIND_EXTRACT_TESTS: list[ExprTest] = [ + ExprTest( + "find_extract_ascii", + expr={ + "$substrCP": [ + "hello world", + {"$indexOfCP": ["hello world", "world"]}, + {"$strLenCP": "world"}, + ] + }, + expected="world", + msg="$substrCP at $indexOfCP position should extract the searched substring", + ), + ExprTest( + "find_extract_multibyte", + expr={ + "$substrCP": [ + "caf\u00e9 latte", + {"$indexOfCP": ["caf\u00e9 latte", "latte"]}, + {"$strLenCP": "latte"}, + ] + }, + expected="latte", + msg="$substrCP + $indexOfCP should work correctly with multi-byte prefix", + ), + ExprTest( + "find_extract_emoji", + expr={ + "$substrCP": [ + "hi \U0001f600 there", + {"$indexOfCP": ["hi \U0001f600 there", "there"]}, + {"$strLenCP": "there"}, + ] + }, + expected="there", + msg="$substrCP + $indexOfCP should work correctly with 4-byte emoji prefix", + ), +] + +# Property [Replace-Case Ordering]: the order of $replaceAll and $toLower affects the result. +REPLACE_CASE_TESTS: list[ExprTest] = [ + ExprTest( + "replace_then_lower", + expr={ + "$toLower": { + "$replaceAll": { + "input": "Hello World", + "find": "World", + "replacement": "EARTH", + } + } + }, + expected="hello earth", + msg="$toLower of $replaceAll should lowercase the replaced text", + ), + ExprTest( + "lower_then_replace", + expr={ + "$replaceAll": { + "input": {"$toLower": "Hello World"}, + "find": "world", + "replacement": "earth", + } + }, + expected="hello earth", + msg="$replaceAll after $toLower should match against lowercase input", + ), +] + +# Property [Strcasecmp Case-Converted]: $strcasecmp returns 0 for $toLower vs $toUpper of the same +# ASCII string. 
+STRCASECMP_TESTS: list[ExprTest] = [ + ExprTest( + "strcasecmp_lower_vs_upper", + expr={"$strcasecmp": [{"$toLower": "Hello World"}, {"$toUpper": "Hello World"}]}, + expected=0, + msg="$strcasecmp of $toLower and $toUpper of same string should be 0", + ), +] + + +# Property [Length Divergence]: replacing ASCII with multi-byte chars preserves $strLenCP but +# increases $strLenBytes. +LENGTH_DIVERGENCE_TESTS: list[ExprTest] = [ + ExprTest( + "replace_multibyte_cp_len", + expr={ + "$strLenCP": {"$replaceAll": {"input": "hello", "find": "e", "replacement": "\u00e9"}} + }, + expected=5, + msg="$strLenCP should be unchanged after replacing ASCII with multi-byte char", + ), + ExprTest( + "replace_multibyte_byte_len", + expr={ + "$strLenBytes": { + "$replaceAll": {"input": "hello", "find": "e", "replacement": "\u00e9"} + } + }, + expected=6, + msg="$strLenBytes should increase after replacing ASCII with multi-byte char", + ), +] + +# Property [Trim Equivalence]: $trim(input, chars) == $ltrim($rtrim(input, chars), chars). 
+TRIM_EQUIVALENCE_TESTS: list[ExprTest] = [ + ExprTest( + "trim_eq_ltrim_rtrim_default", + expr={"$eq": [ + {"$trim": {"input": " hello "}}, + {"$ltrim": {"input": {"$rtrim": {"input": " hello "}}}}, + ]}, + expected=True, + msg="$trim should equal $ltrim($rtrim(x)) with default whitespace", + ), + ExprTest( + "trim_eq_ltrim_rtrim_custom", + expr={"$eq": [ + {"$trim": {"input": "aaahelloaaa", "chars": "a"}}, + {"$ltrim": {"input": {"$rtrim": {"input": "aaahelloaaa", "chars": "a"}}, "chars": "a"}}, + ]}, + expected=True, + msg="$trim should equal $ltrim($rtrim(x)) with custom chars", + ), + ExprTest( + "trim_eq_ltrim_rtrim_mixed_ws", + expr={"$eq": [ + {"$trim": {"input": " \t\nhello\n\t "}}, + {"$ltrim": {"input": {"$rtrim": {"input": " \t\nhello\n\t "}}}}, + ]}, + expected=True, + msg="$trim should equal $ltrim($rtrim(x)) with mixed whitespace", + ), +] + +# Property [Concat Length From Expressions]: $strLenCP of $concat equals expected length when inputs +# are expression results. +CONCAT_LENGTH_TESTS: list[ExprTest] = [ + ExprTest( + "concat_length_from_expressions", + expr={ + "$strLenCP": { + "$concat": [ + {"$toUpper": "caf\u00e9"}, + {"$replaceAll": {"input": "hello", "find": "l", "replacement": "LL"}}, + ] + } + }, + expected=11, + msg="$strLenCP of $concat of expression results should equal expected length", + ), +] + +STRING_COMBINATION_TESTS = ( + CONCAT_SPLIT_TESTS + + CASE_CHAIN_TESTS + + TRIM_CONCAT_TESTS + + FIND_EXTRACT_TESTS + + REPLACE_CASE_TESTS + + STRCASECMP_TESTS + + LENGTH_DIVERGENCE_TESTS + + CONCAT_LENGTH_TESTS + + TRIM_EQUIVALENCE_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(STRING_COMBINATION_TESTS)) +def test_string_combination(collection, test_case: ExprTest): + """Test string operator combinations.""" + result = execute_expression(collection, test_case.expr) + assertSuccess(result, [{"result": test_case.expected}], msg=test_case.msg) From f23b5832e74f8465827ce91e5662da2cd5caf90d Mon Sep 17 00:00:00 2001 From: 
Yunxuan Shi Date: Thu, 9 Apr 2026 17:09:47 -0700 Subject: [PATCH 2/2] Add missing dependencies for $indexOf*/$strLen* and integration tests - Add __init__.py for package resolution - Add INDEXOF*, STRLEN*, EXPRESSION_ARITY_ERROR, FAILED_TO_PARSE_ERROR, INVALID_DOLLAR_FIELD_PATH, BSON_TO_STRING_CONVERSION_ERROR, STRING_SIZE_LIMIT_ERROR to error_codes.py - Add STRING_SIZE_LIMIT_BYTES to test_constants.py - Fix pytest_params import (parametrize module) - Use relative imports for operator common utils - Pin CI MongoDB to 8.2.4 - Run isort/black formatting Signed-off-by: Yunxuan Shi --- .github/workflows/pr-tests.yml | 2 +- .../compatibility/tests/core/__init__.py | 0 .../tests/core/operator/__init__.py | 0 .../core/operator/expressions/__init__.py | 0 .../operator/expressions/string/__init__.py | 0 .../string/indexOfBytes/__init__.py | 0 .../test_indexOfBytes_encoding.py | 9 ++-- .../test_indexOfBytes_index_types.py | 14 +++++-- .../test_indexOfBytes_invalid_args.py | 9 ++-- .../indexOfBytes/test_indexOfBytes_null.py | 9 ++-- .../indexOfBytes/test_indexOfBytes_search.py | 9 ++-- .../test_indexOfBytes_size_limit.py | 9 ++-- .../test_indexOfBytes_type_errors.py | 9 ++-- .../indexOfBytes/test_indexOfBytes_usage.py | 13 +++--- .../expressions/string/indexOfCP/__init__.py | 0 .../indexOfCP/test_indexOfCP_encoding.py | 9 ++-- .../indexOfCP/test_indexOfCP_index_types.py | 14 +++++-- .../indexOfCP/test_indexOfCP_invalid_args.py | 9 ++-- .../string/indexOfCP/test_indexOfCP_null.py | 9 ++-- .../string/indexOfCP/test_indexOfCP_search.py | 9 ++-- .../indexOfCP/test_indexOfCP_size_limit.py | 9 ++-- .../indexOfCP/test_indexOfCP_type_errors.py | 9 ++-- .../string/indexOfCP/test_indexOfCP_usage.py | 13 +++--- .../string/strLenBytes/__init__.py | 0 .../test_strLenBytes_byte_counts.py | 11 ++--- .../test_strLenBytes_input_forms.py | 13 +++--- .../test_strLenBytes_invalid_args.py | 9 ++-- .../test_strLenBytes_invariants.py | 11 ++--- .../test_strLenBytes_size_limit.py | 9 ++-- 
.../test_strLenBytes_type_errors.py | 9 ++-- .../expressions/string/strLenCP/__init__.py | 0 .../strLenCP/test_strLenCP_code_points.py | 11 ++--- .../strLenCP/test_strLenCP_input_forms.py | 13 +++--- .../strLenCP/test_strLenCP_invalid_args.py | 11 ++--- .../strLenCP/test_strLenCP_invariants.py | 11 ++--- .../strLenCP/test_strLenCP_size_limit.py | 11 ++--- .../strLenCP/test_strLenCP_type_errors.py | 11 ++--- .../string/test_string_combination.py | 42 ++++++++++++------- documentdb_tests/framework/error_codes.py | 13 ++++++ documentdb_tests/framework/test_constants.py | 1 + 40 files changed, 223 insertions(+), 127 deletions(-) create mode 100644 documentdb_tests/compatibility/tests/core/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/__init__.py diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index ccbf21ac..37026f58 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -11,7 +11,7 @@ jobs: services: mongodb: - image: mongo:8.2 + image: mongo:8.2.4 ports: - 27017:27017 options: >- diff --git a/documentdb_tests/compatibility/tests/core/__init__.py b/documentdb_tests/compatibility/tests/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/__init__.py 
b/documentdb_tests/compatibility/tests/core/operator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py index 96891a5c..71823add 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_encoding.py @@ -2,12 +2,15 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Encoding]: the operator matches and indexes 
by raw UTF-8 byte sequences without # Unicode normalization, and the result is a byte index rather than a codepoint index. diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py index fa4a85fe..585bd881 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_index_types.py @@ -3,9 +3,15 @@ import pytest from bson import Decimal128, Int64 +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.error_codes import INDEXOF_INDEX_TYPE_ERROR, INDEXOF_NEGATIVE_INDEX_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.error_codes import ( + INDEXOF_INDEX_TYPE_ERROR, + INDEXOF_NEGATIVE_INDEX_ERROR, +) +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_INFINITY, DECIMAL128_NEGATIVE_INFINITY, @@ -20,10 +26,10 @@ INT32_UNDERFLOW, INT64_ZERO, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Index Type Acceptance]: integral Decimal128, Int64, and whole-number floats are accepted # for start and end. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py index bc935678..8c31470f 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_invalid_args.py @@ -6,6 +6,9 @@ from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp from bson.code import Code +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( EXPRESSION_ARITY_ERROR, @@ -15,12 +18,12 @@ INDEXOFBYTES_SUBSTRING_TYPE_ERROR, INVALID_DOLLAR_FIELD_PATH, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import DECIMAL128_ONE_AND_HALF -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Arity]: fewer than 2 or more than 4 arguments produces an error. 
INDEXOFBYTES_ARITY_TESTS: list[IndexOfBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py index a2cb1a5b..3c6cff8c 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_null.py @@ -2,14 +2,17 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import INDEXOFBYTES_SUBSTRING_TYPE_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import MISSING -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Argument shapes for null/missing first-arg tests. _PLACEHOLDER is replaced with None or MISSING. 
_PLACEHOLDER = object() diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py index f90d45c2..9bb5136a 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_search.py @@ -2,12 +2,15 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [First Occurrence]: when the substring appears multiple times, the result is the byte # index of the first occurrence. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py index 2de31dde..0206f24f 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_size_limit.py @@ -2,14 +2,17 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [String Size Limit Success]: a string or substring argument one byte under the limit # is accepted. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py index 76b17dfc..8f808d0e 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_type_errors.py @@ -6,18 +6,21 @@ from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp from bson.code import Code +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( INDEXOF_INDEX_TYPE_ERROR, INDEXOFBYTES_STRING_TYPE_ERROR, INDEXOFBYTES_SUBSTRING_TYPE_ERROR, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import DECIMAL128_ONE_AND_HALF, MISSING -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( + +from .utils.indexOfBytes_common import ( IndexOfBytesTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Type Strictness]: arguments of incorrect type produce an error. 
INDEXOFBYTES_TYPE_ERROR_TESTS: list[IndexOfBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py index 401226d1..18c0bf07 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfBytes/test_indexOfBytes_usage.py @@ -2,16 +2,17 @@ import pytest -from documentdb_tests.framework.assertions import assertResult, assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfBytes.utils.indexOfBytes_common import ( - IndexOfBytesTest, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_project_with_insert, ) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES + +from .utils.indexOfBytes_common import ( + IndexOfBytesTest, +) # Property [Expression Arguments]: all argument positions accept expressions that resolve to the # expected type. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py index 0d2ccf67..b1363e1b 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_encoding.py @@ -2,12 +2,15 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Code Point Indexing]: the result is a UTF-8 code point index, not a byte index. 
INDEXOFCP_CODEPOINT_TESTS: list[IndexOfCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py index 4a023539..bc45bf4f 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_index_types.py @@ -3,9 +3,15 @@ import pytest from bson import Decimal128, Int64 +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.error_codes import INDEXOF_INDEX_TYPE_ERROR, INDEXOF_NEGATIVE_INDEX_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.error_codes import ( + INDEXOF_INDEX_TYPE_ERROR, + INDEXOF_NEGATIVE_INDEX_ERROR, +) +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_INFINITY, DECIMAL128_NAN, @@ -21,10 +27,10 @@ INT32_UNDERFLOW, INT64_ZERO, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Index Type Acceptance]: integral Decimal128, Int64, and whole-number floats are accepted # for start and end. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py index 3b494cce..26c0ca70 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_invalid_args.py @@ -6,6 +6,9 @@ from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp from bson.code import Code +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( EXPRESSION_ARITY_ERROR, @@ -15,14 +18,14 @@ INDEXOFCP_SUBSTRING_TYPE_ERROR, INVALID_DOLLAR_FIELD_PATH, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_ONE_AND_HALF, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Arity]: fewer than 2 or more than 4 arguments produces an error. 
INDEXOFCP_ARITY_TESTS: list[IndexOfCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py index 8ff62b52..568cb924 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_null.py @@ -2,14 +2,17 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import INDEXOFCP_SUBSTRING_TYPE_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import MISSING -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Argument shapes for null/missing first-arg tests. _PLACEHOLDER is replaced with None or MISSING. 
_PLACEHOLDER = object() diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py index 53248c7c..384b7560 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_search.py @@ -2,12 +2,15 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [First Occurrence]: when the substring appears multiple times, the result is the code # point index of the first occurrence. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py index 7c983b5c..a7db7e74 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_size_limit.py @@ -2,14 +2,17 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [String Size Limit - Success]: string and substring arguments just under the size limit # are accepted. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py index 5c0fb7ed..5037ac23 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_type_errors.py @@ -6,21 +6,24 @@ from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp from bson.code import Code +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( INDEXOF_INDEX_TYPE_ERROR, INDEXOFCP_STRING_TYPE_ERROR, INDEXOFCP_SUBSTRING_TYPE_ERROR, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_ONE_AND_HALF, MISSING, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( + +from .utils.indexOfCP_common import ( IndexOfCPTest, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Type Strictness]: arguments of incorrect type produce an error. 
INDEXOFCP_TYPE_ERROR_TESTS: list[IndexOfCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py index 214d58dc..27077014 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/indexOfCP/test_indexOfCP_usage.py @@ -2,16 +2,17 @@ import pytest -from documentdb_tests.framework.assertions import assertResult, assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.indexOfCP.utils.indexOfCP_common import ( - IndexOfCPTest, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_project_with_insert, ) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES + +from .utils.indexOfCP_common import ( + IndexOfCPTest, +) # Property [Expression Arguments]: all argument positions accept expressions that resolve to the # expected type. 
diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py index ed5368ac..a5cf5fb3 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_byte_counts.py @@ -2,15 +2,16 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenBytes_common import ( StrLenBytesTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( - execute_expression, -) # Property [Core Behavior]: returns the number of UTF-8 encoded bytes in the input string. 
STRLENBYTES_CORE_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py index fc281569..80137bbb 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_input_forms.py @@ -2,17 +2,18 @@ import pytest -from documentdb_tests.framework.assertions import assertResult, assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( - StrLenBytesTest, - _expr, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_expression_with_insert, execute_project_with_insert, ) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenBytes_common import ( + StrLenBytesTest, + _expr, +) # Property [Expression Arguments]: the argument accepts any expression that resolves to a string. 
STRLENBYTES_EXPR_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py index d9d19be1..d981a9ad 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invalid_args.py @@ -2,6 +2,9 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( EXPRESSION_TYPE_MISMATCH_ERROR, @@ -9,13 +12,13 @@ INVALID_DOLLAR_FIELD_PATH, STRLENBYTES_TYPE_ERROR, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import MISSING -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + +from .utils.strLenBytes_common import ( StrLenBytesTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Null and Missing Errors]: null or missing arguments produce an error. 
STRLENBYTES_NULL_ERROR_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py index 158dbba9..b68d112b 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_invariants.py @@ -2,15 +2,16 @@ import pytest -from documentdb_tests.framework.assertions import assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( - StrLenBytesTest, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_project, ) +from documentdb_tests.framework.assertions import assertSuccess +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenBytes_common import ( + StrLenBytesTest, +) # Property [Return Type]: the result is always an integer when the expression succeeds. 
STRLENBYTES_RETURN_TYPE_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py index 7cacc2bf..9ae5bbde 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_size_limit.py @@ -2,15 +2,18 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + +from .utils.strLenBytes_common import ( StrLenBytesTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [String Size Limit - Success]: inputs just under the size limit succeed. 
STRLENBYTES_SIZE_LIMIT_SUCCESS_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py index 0b69ee34..7f3689d1 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenBytes/test_strLenBytes_type_errors.py @@ -6,9 +6,12 @@ from bson import Binary, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp from bson.code import Code +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRLENBYTES_TYPE_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_INFINITY, DECIMAL128_NAN, @@ -18,11 +21,11 @@ FLOAT_NAN, FLOAT_NEGATIVE_INFINITY, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenBytes.utils.strLenBytes_common import ( + +from .utils.strLenBytes_common import ( StrLenBytesTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression # Property [Type Strictness]: any non-string argument produces an error. 
STRLENBYTES_TYPE_ERROR_TESTS: list[StrLenBytesTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py index acf0d56c..2303642d 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_code_points.py @@ -2,15 +2,16 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenCP_common import ( StrLenCPTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( - execute_expression, -) # Property [Core Behavior]: returns the number of UTF-8 code points in the input string. 
STRLENCP_CORE_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py index f33ecf2a..aa3c3181 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_input_forms.py @@ -2,17 +2,18 @@ import pytest -from documentdb_tests.framework.assertions import assertResult, assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( - StrLenCPTest, - _expr, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_expression_with_insert, execute_project_with_insert, ) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenCP_common import ( + StrLenCPTest, + _expr, +) # Property [Expression Arguments]: the argument accepts any expression that resolves to a string. 
STRLENCP_EXPR_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py index 194561d2..401c10d6 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invalid_args.py @@ -2,6 +2,9 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import ( EXPRESSION_TYPE_MISMATCH_ERROR, @@ -9,15 +12,13 @@ INVALID_DOLLAR_FIELD_PATH, STRLENCP_TYPE_ERROR, ) -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import MISSING -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( + +from .utils.strLenCP_common import ( StrLenCPTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( - execute_expression, -) # Property [Null and Missing Errors]: null or missing arguments produce an error. 
STRLENCP_NULL_ERROR_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py index 40a56168..360798f3 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_invariants.py @@ -2,15 +2,16 @@ import pytest -from documentdb_tests.framework.assertions import assertSuccess -from documentdb_tests.framework.test_case import pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( - StrLenCPTest, -) from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( execute_expression, execute_project, ) +from documentdb_tests.framework.assertions import assertSuccess +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.strLenCP_common import ( + StrLenCPTest, +) # Property [Return Type]: the result is always an integer when the expression succeeds. 
STRLENCP_RETURN_TYPE_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py index 3ee667d6..47a3732c 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_size_limit.py @@ -2,17 +2,18 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( + +from .utils.strLenCP_common import ( StrLenCPTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( - execute_expression, -) # Property [String Size Limit - Success]: inputs just under the size limit succeed. 
STRLENCP_SIZE_LIMIT_SUCCESS_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py index da48d1d5..f33e5689 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/strLenCP/test_strLenCP_type_errors.py @@ -5,9 +5,12 @@ import pytest from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertResult from documentdb_tests.framework.error_codes import STRLENCP_TYPE_ERROR -from documentdb_tests.framework.test_case import pytest_params +from documentdb_tests.framework.parametrize import pytest_params from documentdb_tests.framework.test_constants import ( DECIMAL128_INFINITY, DECIMAL128_NAN, @@ -17,13 +20,11 @@ FLOAT_NAN, FLOAT_NEGATIVE_INFINITY, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.string.strLenCP.utils.strLenCP_common import ( + +from .utils.strLenCP_common import ( StrLenCPTest, _expr, ) -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( - execute_expression, -) # Property [Type Strictness]: any non-string argument produces an error. 
STRLENCP_TYPE_ERROR_TESTS: list[StrLenCPTest] = [ diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py index 51d55e0f..f31c0b4e 100644 --- a/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/test_string_combination.py @@ -14,9 +14,12 @@ import pytest +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) from documentdb_tests.framework.assertions import assertSuccess -from documentdb_tests.framework.test_case import BaseTestCase, pytest_params -from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import execute_expression +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_case import BaseTestCase @dataclass(frozen=True) @@ -190,28 +193,39 @@ class ExprTest(BaseTestCase): TRIM_EQUIVALENCE_TESTS: list[ExprTest] = [ ExprTest( "trim_eq_ltrim_rtrim_default", - expr={"$eq": [ - {"$trim": {"input": " hello "}}, - {"$ltrim": {"input": {"$rtrim": {"input": " hello "}}}}, - ]}, + expr={ + "$eq": [ + {"$trim": {"input": " hello "}}, + {"$ltrim": {"input": {"$rtrim": {"input": " hello "}}}}, + ] + }, expected=True, msg="$trim should equal $ltrim($rtrim(x)) with default whitespace", ), ExprTest( "trim_eq_ltrim_rtrim_custom", - expr={"$eq": [ - {"$trim": {"input": "aaahelloaaa", "chars": "a"}}, - {"$ltrim": {"input": {"$rtrim": {"input": "aaahelloaaa", "chars": "a"}}, "chars": "a"}}, - ]}, + expr={ + "$eq": [ + {"$trim": {"input": "aaahelloaaa", "chars": "a"}}, + { + "$ltrim": { + "input": {"$rtrim": {"input": "aaahelloaaa", "chars": "a"}}, + "chars": "a", + } + }, + ] + }, expected=True, msg="$trim should equal $ltrim($rtrim(x)) with custom chars", ), ExprTest( 
"trim_eq_ltrim_rtrim_mixed_ws", - expr={"$eq": [ - {"$trim": {"input": " \t\nhello\n\t "}}, - {"$ltrim": {"input": {"$rtrim": {"input": " \t\nhello\n\t "}}}}, - ]}, + expr={ + "$eq": [ + {"$trim": {"input": " \t\nhello\n\t "}}, + {"$ltrim": {"input": {"$rtrim": {"input": " \t\nhello\n\t "}}}}, + ] + }, expected=True, msg="$trim should equal $ltrim($rtrim(x)) with mixed whitespace", ), diff --git a/documentdb_tests/framework/error_codes.py b/documentdb_tests/framework/error_codes.py index c285b5aa..70c23b77 100644 --- a/documentdb_tests/framework/error_codes.py +++ b/documentdb_tests/framework/error_codes.py @@ -4,11 +4,16 @@ """ DIVIDE_BY_ZERO_ERROR = 2 +FAILED_TO_PARSE_ERROR = 9 TYPE_MISMATCH_ERROR = 14 +BSON_TO_STRING_CONVERSION_ERROR = 16007 EXPRESSION_TYPE_MISMATCH_ERROR = 16020 +STRING_SIZE_LIMIT_ERROR = 16493 MODULO_ZERO_REMAINDER_ERROR = 16610 MODULO_NON_NUMERIC_ERROR = 16611 MORE_THAN_ONE_DATE_ERROR = 16612 +INVALID_DOLLAR_FIELD_PATH = 16872 +EXPRESSION_ARITY_ERROR = 28667 ABS_OVERFLOW_ERROR = 28680 LOG_NON_NUMERIC_VALUE_ERROR = 28756 LOG_NON_NUMERIC_BASE_ERROR = 28757 @@ -20,4 +25,12 @@ POW_BASE_ZERO_EXP_NEGATIVE_ERROR = 28764 NON_NUMERIC_TYPE_MISMATCH_ERROR = 28765 LN_NON_POSITIVE_INPUT_ERROR = 28766 +STRLENCP_TYPE_ERROR = 34471 +STRLENBYTES_TYPE_ERROR = 34473 +INDEXOFBYTES_STRING_TYPE_ERROR = 40091 +INDEXOFBYTES_SUBSTRING_TYPE_ERROR = 40092 +INDEXOFCP_STRING_TYPE_ERROR = 40093 +INDEXOFCP_SUBSTRING_TYPE_ERROR = 40094 +INDEXOF_INDEX_TYPE_ERROR = 40096 +INDEXOF_NEGATIVE_INDEX_ERROR = 40097 MODULO_DECIMAL128_ZERO_REMAINDER_ERROR = 5733415 diff --git a/documentdb_tests/framework/test_constants.py b/documentdb_tests/framework/test_constants.py index 9e3088ed..d0aa0200 100644 --- a/documentdb_tests/framework/test_constants.py +++ b/documentdb_tests/framework/test_constants.py @@ -64,6 +64,7 @@ # Other constant values MISSING = "$missing" +STRING_SIZE_LIMIT_BYTES = 16 * 1024 * 1024 # Int32 lists NUMERIC_INT32_NEGATIVE = [INT32_UNDERFLOW, INT32_MIN]