diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index ccbf21ac..37026f58 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -11,7 +11,7 @@ jobs: services: mongodb: - image: mongo:8.2 + image: mongo:8.2.4 ports: - 27017:27017 options: >- diff --git a/documentdb_tests/compatibility/tests/core/__init__.py b/documentdb_tests/compatibility/tests/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/__init__.py b/documentdb_tests/compatibility/tests/core/operator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substr/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substr/test_operator_substr.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substr/test_operator_substr.py new file mode 100644 index 00000000..195d7009 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substr/test_operator_substr.py @@ -0,0 +1,7 @@ +# $substr is a deprecated alias for $substrBytes. All tests in the $substrBytes +# test files are parametrized over both operator names. 
+import pytest + +SUBSTR_OPERATOR = pytest.param("$substr", id="substr") + +__all__ = ["SUBSTR_OPERATOR"] diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_core.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_core.py new file mode 100644 index 00000000..da4f7641 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_core.py @@ -0,0 +1,354 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + INT32_MAX, + INT32_MIN, + INT64_MAX, + INT64_MIN, +) + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Core Substring Extraction]: $substrBytes operates on zero-based UTF-8 byte positions and +# returns the substring starting at the given byte index for the specified number of bytes. 
+SUBSTRBYTES_CORE_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "core_full_string", + string="hello", + byte_index=0, + byte_count=5, + expected="hello", + msg="$substrBytes should extract full string", + ), + SubstrBytesTest( + "core_first_byte", + string="hello", + byte_index=0, + byte_count=1, + expected="h", + msg="$substrBytes should extract first byte", + ), + SubstrBytesTest( + "core_last_byte", + string="hello", + byte_index=4, + byte_count=1, + expected="o", + msg="$substrBytes should extract last byte", + ), + SubstrBytesTest( + "core_middle", + string="hello", + byte_index=1, + byte_count=3, + expected="ell", + msg="$substrBytes should extract middle bytes", + ), + SubstrBytesTest( + "core_offset_two", + string="hello", + byte_index=2, + byte_count=2, + expected="ll", + msg="$substrBytes should extract from offset two", + ), + SubstrBytesTest( + "core_ten_chars", + string="abcdefghij", + byte_index=0, + byte_count=10, + expected="abcdefghij", + msg="$substrBytes should extract ten characters", + ), + SubstrBytesTest( + "core_second_half", + string="abcdefghij", + byte_index=5, + byte_count=5, + expected="fghij", + msg="$substrBytes should extract second half", + ), + SubstrBytesTest( + "core_inner_slice", + string="abcdefghij", + byte_index=3, + byte_count=4, + expected="defg", + msg="$substrBytes should extract inner slice", + ), + SubstrBytesTest( + "core_single_char", + string="a", + byte_index=0, + byte_count=1, + expected="a", + msg="$substrBytes should extract single character", + ), +] + + +# Property [Negative Length]: any negative byte_count value returns the rest of the string from the +# start position to the end, regardless of the magnitude of the negative value. 
+SUBSTRBYTES_NEGATIVE_LENGTH_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "neg_len_minus_one", + string="hello", + byte_index=0, + byte_count=-1, + expected="hello", + msg="$substrBytes should return rest of string for length -1", + ), + SubstrBytesTest( + "neg_len_minus_100", + string="hello", + byte_index=0, + byte_count=-100, + expected="hello", + msg="$substrBytes should return rest of string for length -100", + ), + SubstrBytesTest( + "neg_len_int32_min", + string="hello", + byte_index=0, + byte_count=INT32_MIN, + expected="hello", + msg="$substrBytes should return rest of string for INT32_MIN length", + ), + SubstrBytesTest( + "neg_len_int64_min", + string="hello", + byte_index=0, + byte_count=INT64_MIN, + expected="hello", + msg="$substrBytes should return rest of string for INT64_MIN length", + ), + SubstrBytesTest( + "neg_len_from_middle", + string="hello", + byte_index=2, + byte_count=-1, + expected="llo", + msg="$substrBytes should return rest from middle with negative length", + ), + SubstrBytesTest( + "neg_len_from_last", + string="hello", + byte_index=4, + byte_count=-1, + expected="o", + msg="$substrBytes should return last byte with negative length", + ), + SubstrBytesTest( + "neg_len_at_end", + string="hello", + byte_index=5, + byte_count=-1, + expected="", + msg="$substrBytes should return empty string at end with negative length", + ), + SubstrBytesTest( + "neg_len_beyond_end", + string="hello", + byte_index=10, + byte_count=-1, + expected="", + msg="$substrBytes should return empty string beyond end with negative length", + ), + SubstrBytesTest( + "neg_len_empty_string", + string="", + byte_index=0, + byte_count=-1, + expected="", + msg="$substrBytes should return empty string for empty input with negative length", + ), + # Multi-byte string with negative length returns full string (no mid-character end error). 
+ SubstrBytesTest( + "neg_len_multibyte", + string="aé中😀", + byte_index=0, + byte_count=-1, + expected="aé中😀", + msg="$substrBytes should return full multi-byte string with negative length", + ), +] + + +# Property [Boundary Behavior]: start index at or beyond the string byte length returns empty +# string, byte count exceeding remaining bytes clamps to the end, and empty strings and extreme +# index/count values (up to INT64_MAX) are handled without error. +SUBSTRBYTES_BOUNDARY_TESTS: list[SubstrBytesTest] = [ + # Start at exactly the string byte length. + SubstrBytesTest( + "boundary_start_at_end", + string="hello", + byte_index=5, + byte_count=1, + expected="", + msg="$substrBytes should return empty when start equals string length", + ), + # Start beyond the string byte length. + SubstrBytesTest( + "boundary_start_beyond", + string="hello", + byte_index=6, + byte_count=1, + expected="", + msg="$substrBytes should return empty when start exceeds string length", + ), + SubstrBytesTest( + "boundary_start_int32_max", + string="hello", + byte_index=INT32_MAX, + byte_count=1, + expected="", + msg="$substrBytes should return empty for INT32_MAX start", + ), + SubstrBytesTest( + "boundary_start_int64_max", + string="hello", + byte_index=INT64_MAX, + byte_count=1, + expected="", + msg="$substrBytes should return empty for INT64_MAX start", + ), + # Byte count exceeding remaining bytes clamps to end. + SubstrBytesTest( + "boundary_count_exceeds", + string="hello", + byte_index=0, + byte_count=100, + expected="hello", + msg="$substrBytes should clamp count exceeding string length", + ), + SubstrBytesTest( + "boundary_count_exceeds_mid", + string="hello", + byte_index=3, + byte_count=100, + expected="lo", + msg="$substrBytes should clamp count exceeding remaining bytes from middle", + ), + # Multi-byte string with count exceeding byte length clamps to end. 
+ SubstrBytesTest( + "boundary_count_exceeds_multibyte", + string="aé中😀", + byte_index=0, + byte_count=100, + expected="aé中😀", + msg="$substrBytes should clamp count exceeding multi-byte string length", + ), + # int32 max and int64 max accepted as byte_count. + SubstrBytesTest( + "boundary_count_int32_max", + string="hello", + byte_index=0, + byte_count=INT32_MAX, + expected="hello", + msg="$substrBytes should accept INT32_MAX as byte count", + ), + SubstrBytesTest( + "boundary_count_int64_max", + string="hello", + byte_index=0, + byte_count=INT64_MAX, + expected="hello", + msg="$substrBytes should accept INT64_MAX as byte count", + ), + # Zero byte count at various positions. + SubstrBytesTest( + "boundary_zero_count_start", + string="hello", + byte_index=0, + byte_count=0, + expected="", + msg="$substrBytes should return empty for zero count at start", + ), + SubstrBytesTest( + "boundary_zero_count_mid", + string="hello", + byte_index=3, + byte_count=0, + expected="", + msg="$substrBytes should return empty for zero count at middle", + ), + SubstrBytesTest( + "boundary_zero_count_at_end", + string="hello", + byte_index=5, + byte_count=0, + expected="", + msg="$substrBytes should return empty for zero count at end", + ), + # Empty string with various parameters. + SubstrBytesTest( + "boundary_empty_zero_zero", + string="", + byte_index=0, + byte_count=0, + expected="", + msg="$substrBytes should return empty for empty string with zero params", + ), + SubstrBytesTest( + "boundary_empty_positive_len", + string="", + byte_index=0, + byte_count=5, + expected="", + msg="$substrBytes should return empty for empty string with positive length", + ), + SubstrBytesTest( + "boundary_empty_beyond_start", + string="", + byte_index=5, + byte_count=5, + expected="", + msg="$substrBytes should return empty for empty string with start beyond length", + ), + # Large start + large count whose sum would overflow int64 produces no error. 
+ SubstrBytesTest( + "boundary_overflow_start_count_sum", + string="hello", + byte_index=INT64_MAX, + byte_count=INT64_MAX, + expected="", + msg="$substrBytes should handle INT64_MAX start and count without overflow error", + ), + SubstrBytesTest( + "boundary_overflow_int32_max_start_int64_max_count", + string="hello", + byte_index=INT32_MAX, + byte_count=INT64_MAX, + expected="", + msg="$substrBytes should handle INT32_MAX start and INT64_MAX count without overflow error", + ), +] + + +SUBSTRBYTES_CORE_ALL_TESTS = ( + SUBSTRBYTES_CORE_TESTS + SUBSTRBYTES_NEGATIVE_LENGTH_TESTS + SUBSTRBYTES_BOUNDARY_TESTS +) + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_CORE_ALL_TESTS)) +def test_substrbytes_core(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes cases.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_encoding.py new file mode 100644 index 00000000..1303f5b6 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_encoding.py @@ -0,0 +1,323 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Encoding]: characters of various byte widths and special byte values are correctly +# extracted when start and length align with byte 
boundaries. +SUBSTRBYTES_ENCODING_TESTS: list[SubstrBytesTest] = [ + # 2-byte characters (U+03C3 σ). + SubstrBytesTest( + "encoding_2byte_sigma", + string="σ", + byte_index=0, + byte_count=2, + expected="σ", + msg="$substrBytes should extract 2-byte character", + ), + # 3-byte characters (U+4E2D 中). + SubstrBytesTest( + "encoding_3byte_zhong", + string="中", + byte_index=0, + byte_count=3, + expected="中", + msg="$substrBytes should extract 3-byte character", + ), + # 4-byte characters (U+1F600 😀, U+10400 Deseret 𐐀). + SubstrBytesTest( + "encoding_4byte_grinning", + string="😀", + byte_index=0, + byte_count=4, + expected="😀", + msg="$substrBytes should extract 4-byte emoji", + ), + SubstrBytesTest( + "encoding_4byte_deseret", + string="\U00010400", + byte_index=0, + byte_count=4, + expected="\U00010400", + msg="$substrBytes should extract 4-byte Deseret character", + ), + # Mixed ASCII and multi-byte: "aé中😀" = 1 + 2 + 3 + 4 = 10 bytes. + SubstrBytesTest( + "encoding_mixed_ascii", + string="aé中😀", + byte_index=0, + byte_count=1, + expected="a", + msg="$substrBytes should extract ASCII byte from mixed string", + ), + SubstrBytesTest( + "encoding_mixed_2byte", + string="aé中😀", + byte_index=1, + byte_count=2, + expected="é", + msg="$substrBytes should extract 2-byte char from mixed string", + ), + SubstrBytesTest( + "encoding_mixed_3byte", + string="aé中😀", + byte_index=3, + byte_count=3, + expected="中", + msg="$substrBytes should extract 3-byte char from mixed string", + ), + SubstrBytesTest( + "encoding_mixed_4byte", + string="aé中😀", + byte_index=6, + byte_count=4, + expected="😀", + msg="$substrBytes should extract 4-byte char from mixed string", + ), + # Extract multiple multi-byte characters in one slice. 
+ SubstrBytesTest( + "encoding_mixed_2byte_and_3byte", + string="aé中😀", + byte_index=1, + byte_count=5, + expected="é中", + msg="$substrBytes should extract 2-byte and 3-byte chars in one slice", + ), + SubstrBytesTest( + "encoding_mixed_3byte_and_4byte", + string="aé中😀", + byte_index=3, + byte_count=7, + expected="中😀", + msg="$substrBytes should extract 3-byte and 4-byte chars in one slice", + ), + SubstrBytesTest( + "encoding_mixed_full", + string="aé中😀", + byte_index=0, + byte_count=10, + expected="aé中😀", + msg="$substrBytes should extract full mixed multi-byte string", + ), + # Precomposed é (U+00E9) is 2 bytes. + SubstrBytesTest( + "encoding_precomposed_e_accent", + string="\u00e9", + byte_index=0, + byte_count=2, + expected="\u00e9", + msg="$substrBytes should extract precomposed 2-byte character", + ), + # Decomposed e + combining acute (U+0301) is 3 bytes total. + SubstrBytesTest( + "encoding_decomposed_e_accent", + string="e\u0301", + byte_index=0, + byte_count=3, + expected="e\u0301", + msg="$substrBytes should extract decomposed 3-byte sequence", + ), + # Null bytes are preserved and do not terminate the string. + SubstrBytesTest( + "encoding_null_byte_full", + string="a\x00b", + byte_index=0, + byte_count=3, + expected="a\x00b", + msg="$substrBytes should preserve null bytes", + ), + SubstrBytesTest( + "encoding_null_byte_after", + string="a\x00b", + byte_index=2, + byte_count=1, + expected="b", + msg="$substrBytes should extract byte after null byte", + ), + SubstrBytesTest( + "encoding_null_byte_extract", + string="a\x00b", + byte_index=1, + byte_count=1, + expected="\x00", + msg="$substrBytes should extract null byte itself", + ), + # Control characters are preserved. 
+ SubstrBytesTest( + "encoding_control_chars", + string="\x01\x02\x1f", + byte_index=0, + byte_count=3, + expected="\x01\x02\x1f", + msg="$substrBytes should preserve control characters", + ), + SubstrBytesTest( + "encoding_control_char_mid", + string="\x01\x02\x1f", + byte_index=1, + byte_count=1, + expected="\x02", + msg="$substrBytes should extract control character from middle", + ), + # Whitespace characters are preserved and extractable. + SubstrBytesTest( + "encoding_whitespace", + string=" \t\n\r", + byte_index=0, + byte_count=4, + expected=" \t\n\r", + msg="$substrBytes should preserve whitespace characters", + ), + SubstrBytesTest( + "encoding_newline", + string=" \t\n\r", + byte_index=2, + byte_count=1, + expected="\n", + msg="$substrBytes should extract newline character", + ), + # Unicode whitespace: NBSP U+00A0 (2 bytes), U+2000 EN QUAD (3 bytes, an en-wide space). + SubstrBytesTest( + "encoding_nbsp", + string="\u00a0", + byte_index=0, + byte_count=2, + expected="\u00a0", + msg="$substrBytes should extract 2-byte NBSP", + ), + SubstrBytesTest( + "encoding_en_space", + string="\u2000", + byte_index=0, + byte_count=3, + expected="\u2000", + msg="$substrBytes should extract 3-byte en space", + ), + # Zero-width characters: ZWSP U+200B (3 bytes), ZWJ U+200D (3 bytes). + SubstrBytesTest( + "encoding_zwsp", + string="\u200b", + byte_index=0, + byte_count=3, + expected="\u200b", + msg="$substrBytes should extract 3-byte zero-width space", + ), + SubstrBytesTest( + "encoding_zwj", + string="\u200d", + byte_index=0, + byte_count=3, + expected="\u200d", + msg="$substrBytes should extract 3-byte zero-width joiner", + ), + # BOM U+FEFF is preserved and occupies 3 bytes. + SubstrBytesTest( + "encoding_bom", + string="\ufeff", + byte_index=0, + byte_count=3, + expected="\ufeff", + msg="$substrBytes should extract 3-byte BOM", + ), + # BSON/JSON-significant characters are treated as data. 
+ SubstrBytesTest( + "encoding_bson_json_chars", + string='{}"$\\[]', + byte_index=0, + byte_count=7, + expected='{}"$\\[]', + msg="$substrBytes should treat BSON/JSON characters as data", + ), + SubstrBytesTest( + "encoding_bson_json_dollar", + string='{}"$\\[]', + byte_index=3, + byte_count=1, + expected="$", + msg="$substrBytes should extract dollar sign from BSON/JSON string", + ), + # $literal prevents $-prefixed strings from being misinterpreted as field references. + SubstrBytesTest( + "encoding_literal_dollar_prefix", + string={"$literal": "$field"}, + byte_index=0, + byte_count=6, + expected="$field", + msg="$substrBytes should handle $literal dollar-prefixed string", + ), +] + +# Property [Grapheme Splitting]: the operator splits at byte boundaries, not grapheme cluster +# boundaries, so combining marks and ZWJ emoji components are extracted independently. +SUBSTRBYTES_GRAPHEME_SPLIT_TESTS: list[SubstrBytesTest] = [ + # Base character extracted independently of following combining mark. + SubstrBytesTest( + "grapheme_base_without_combining", + string="e\u0301", + byte_index=0, + byte_count=1, + expected="e", + msg="$substrBytes should extract base character without combining mark", + ), + # Combining mark extracted alone. + SubstrBytesTest( + "grapheme_combining_mark_alone", + string="e\u0301", + byte_index=1, + byte_count=2, + expected="\u0301", + msg="$substrBytes should extract combining mark alone", + ), + # ZWJ emoji sequence: 👨 (4 bytes) + ZWJ U+200D (3 bytes) + 👩 (4 bytes) = 11 bytes. 
+ SubstrBytesTest( + "grapheme_zwj_emoji_first", + string="👨\u200d👩", + byte_index=0, + byte_count=4, + expected="👨", + msg="$substrBytes should extract first emoji from ZWJ sequence", + ), + SubstrBytesTest( + "grapheme_zwj_emoji_joiner", + string="👨\u200d👩", + byte_index=4, + byte_count=3, + expected="\u200d", + msg="$substrBytes should extract ZWJ joiner from emoji sequence", + ), + SubstrBytesTest( + "grapheme_zwj_emoji_second", + string="👨\u200d👩", + byte_index=7, + byte_count=4, + expected="👩", + msg="$substrBytes should extract second emoji from ZWJ sequence", + ), +] + + +SUBSTRBYTES_ENCODING_ALL_TESTS = SUBSTRBYTES_ENCODING_TESTS + SUBSTRBYTES_GRAPHEME_SPLIT_TESTS + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_ENCODING_ALL_TESTS)) +def test_substrbytes_encoding(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes cases.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_invalid_args.py new file mode 100644 index 00000000..1c9610e7 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_invalid_args.py @@ -0,0 +1,683 @@ +from __future__ import annotations + +import pytest +from bson import ( + Decimal128, + Int64, +) + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BSON_TO_STRING_CONVERSION_ERROR, + EXPRESSION_TYPE_MISMATCH_ERROR, + FAILED_TO_PARSE_ERROR, + 
FIELD_PATH_NULL_BYTE_ERROR, + INVALID_DOLLAR_FIELD_PATH, + OUT_OF_RANGE_CONVERSION_ERROR, + SUBSTR_CONTINUATION_BYTE_START_ERROR, + SUBSTR_LENGTH_TYPE_ERROR, + SUBSTR_MID_CHARACTER_END_ERROR, + SUBSTR_NEGATIVE_START_ERROR, + SUBSTR_START_TYPE_ERROR, +) +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_LARGE_EXPONENT, + DECIMAL128_NEGATIVE_INFINITY, + DOUBLE_NEAR_MAX, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT64_MIN, +) + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Fractional Negative Boundary for Index - Error]: fractional values that round or +# truncate to -1 or below produce a negative start error. These are separated from +# SUBSTRBYTES_NEGATIVE_START_TESTS because they pair with the success-side fractional boundary +# tests that verify values just above the rounding threshold. +SUBSTRBYTES_FRAC_NEG_IDX_ERROR_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "frac_neg_idx_double_minus_1_0", + string="hello", + byte_index=-1.0, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject double -1.0 as start index", + ), + SubstrBytesTest( + "frac_neg_idx_decimal_minus_0_6", + string="hello", + byte_index=Decimal128("-0.6"), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 -0.6 as start index (rounds to -1)", + ), +] + +# Property [Negative Start Error]: a negative byte_index value after numeric coercion produces an +# error, including Decimal128 special values that overflow to int64 min. 
+SUBSTRBYTES_NEGATIVE_START_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "neg_start_int32", + string="hello", + byte_index=-1, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject negative int32 start", + ), + SubstrBytesTest( + "neg_start_int32_large", + string="hello", + byte_index=-100, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject large negative int32 start", + ), + SubstrBytesTest( + "neg_start_int64", + string="hello", + byte_index=Int64(-1), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject negative int64 start", + ), + SubstrBytesTest( + "neg_start_int64_min", + string="hello", + byte_index=INT64_MIN, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject INT64_MIN start", + ), + SubstrBytesTest( + "neg_start_decimal_large", + string="hello", + byte_index=Decimal128("-100"), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject large negative Decimal128 start", + ), + # Decimal128 special values overflow to int64 min, triggering negative start error. 
+ SubstrBytesTest( + "neg_start_decimal_nan", + string="hello", + byte_index=Decimal128("NaN"), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 NaN start as negative overflow", + ), + SubstrBytesTest( + "neg_start_decimal_inf", + string="hello", + byte_index=DECIMAL128_INFINITY, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 Infinity start as negative overflow", + ), + SubstrBytesTest( + "neg_start_decimal_neg_inf", + string="hello", + byte_index=DECIMAL128_NEGATIVE_INFINITY, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 -Infinity start as negative overflow", + ), + SubstrBytesTest( + "neg_start_decimal_overflow", + string="hello", + byte_index=DECIMAL128_LARGE_EXPONENT, + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 1E+6144 start as negative overflow", + ), + SubstrBytesTest( + "neg_start_decimal_max", + string="hello", + byte_index=Decimal128("9999999999999999999999999999999999E+6111"), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 max value start as negative overflow", + ), + SubstrBytesTest( + "neg_start_decimal_34_digit", + string="hello", + byte_index=Decimal128("9999999999999999999999999999999999"), + byte_count=3, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes should reject Decimal128 34-digit start as negative overflow", + ), +] + +# Property [Double Coercion Out-of-Range Error]: double NaN, infinity, negative infinity, and +# values outside int64 range cannot be coerced to long for either byte_index or byte_count. +SUBSTRBYTES_DOUBLE_COERCION_ERROR_TESTS: list[SubstrBytesTest] = [ + # byte_index. 
+ SubstrBytesTest( + "double_coerce_start_nan", + string="hello", + byte_index=FLOAT_NAN, + byte_count=3, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double NaN start", + ), + SubstrBytesTest( + "double_coerce_start_inf", + string="hello", + byte_index=FLOAT_INFINITY, + byte_count=3, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double Infinity start", + ), + SubstrBytesTest( + "double_coerce_start_neg_inf", + string="hello", + byte_index=FLOAT_NEGATIVE_INFINITY, + byte_count=3, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double -Infinity start", + ), + SubstrBytesTest( + "double_coerce_start_1e20", + string="hello", + byte_index=1e20, + byte_count=3, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double 1e20 start as out of range", + ), + SubstrBytesTest( + "double_coerce_start_1e308", + string="hello", + byte_index=DOUBLE_NEAR_MAX, + byte_count=3, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject near-max double start as out of range", + ), + # byte_count. 
+ SubstrBytesTest( + "double_coerce_count_nan", + string="hello", + byte_index=0, + byte_count=FLOAT_NAN, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double NaN count", + ), + SubstrBytesTest( + "double_coerce_count_inf", + string="hello", + byte_index=0, + byte_count=FLOAT_INFINITY, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double Infinity count", + ), + SubstrBytesTest( + "double_coerce_count_neg_inf", + string="hello", + byte_index=0, + byte_count=FLOAT_NEGATIVE_INFINITY, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double -Infinity count", + ), + SubstrBytesTest( + "double_coerce_count_1e20", + string="hello", + byte_index=0, + byte_count=1e20, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject double 1e20 count as out of range", + ), + SubstrBytesTest( + "double_coerce_count_1e308", + string="hello", + byte_index=0, + byte_count=DOUBLE_NEAR_MAX, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes should reject near-max double count as out of range", + ), +] + +# Property [UTF-8 Continuation Byte Start Error]: starting at a byte offset that is a UTF-8 +# continuation byte produces an error, even with negative byte_count. +SUBSTRBYTES_CONTINUATION_START_TESTS: list[SubstrBytesTest] = [ + # 2-byte character é (U+00E9): continuation byte at offset 1. + SubstrBytesTest( + "cont_start_2byte_pos1", + string="é", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at continuation byte of 2-byte char", + ), + # 3-byte character 中 (U+4E2D): continuation bytes at offsets 1 and 2. 
+ SubstrBytesTest( + "cont_start_3byte_pos1", + string="中", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at first continuation byte of 3-byte char", + ), + SubstrBytesTest( + "cont_start_3byte_pos2", + string="中", + byte_index=2, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at second continuation byte of 3-byte char", + ), + # 4-byte character 😀 (U+1F600): continuation bytes at offsets 1, 2, and 3. + SubstrBytesTest( + "cont_start_4byte_pos1", + string="😀", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at first continuation byte of 4-byte char", + ), + SubstrBytesTest( + "cont_start_4byte_pos2", + string="😀", + byte_index=2, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at second continuation byte of 4-byte char", + ), + SubstrBytesTest( + "cont_start_4byte_pos3", + string="😀", + byte_index=3, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at third continuation byte of 4-byte char", + ), + # Combining mark U+0301 (2 bytes starting at offset 1 in "e\u0301"). + SubstrBytesTest( + "cont_start_combining_mark", + string="e\u0301", + byte_index=2, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at continuation byte of combining mark", + ), + # BOM U+FEFF (3 bytes): continuation bytes at offsets 1 and 2. + SubstrBytesTest( + "cont_start_bom_pos1", + string="\ufeff", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at continuation byte of BOM", + ), + # ZWJ U+200D (3 bytes): continuation byte at offset 1. 
+ SubstrBytesTest( + "cont_start_zwj_pos1", + string="\u200d", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject start at continuation byte of ZWJ", + ), + # Negative byte_count does not bypass the check. + SubstrBytesTest( + "cont_start_neg_count", + string="é", + byte_index=1, + byte_count=-1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes should reject continuation byte start even with negative count", + ), +] + +# Property [UTF-8 Mid-Character End Error]: when start + byte_count lands in the middle of a +# multi-byte UTF-8 character, an error is produced. +SUBSTRBYTES_MID_CHAR_END_TESTS: list[SubstrBytesTest] = [ + # 2-byte character é: byte_index=0, byte_count=1 lands mid-character. + SubstrBytesTest( + "mid_end_2byte_pos1", + string="é", + byte_index=0, + byte_count=1, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length landing mid 2-byte character", + ), + # 3-byte character 中: byte_count=1 and byte_count=2 both land mid-character. + SubstrBytesTest( + "mid_end_3byte_pos1", + string="中", + byte_index=0, + byte_count=1, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length 1 landing mid 3-byte character", + ), + SubstrBytesTest( + "mid_end_3byte_pos2", + string="中", + byte_index=0, + byte_count=2, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length 2 landing mid 3-byte character", + ), + # 4-byte character 😀: byte_count=1, 2, 3 all land mid-character. 
+ SubstrBytesTest( + "mid_end_4byte_pos1", + string="😀", + byte_index=0, + byte_count=1, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length 1 landing mid 4-byte character", + ), + SubstrBytesTest( + "mid_end_4byte_pos2", + string="😀", + byte_index=0, + byte_count=2, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length 2 landing mid 4-byte character", + ), + SubstrBytesTest( + "mid_end_4byte_pos3", + string="😀", + byte_index=0, + byte_count=3, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length 3 landing mid 4-byte character", + ), + # Mixed string "café" = 3 ASCII + 2-byte é = 5 bytes. Length 4 lands mid-é. + SubstrBytesTest( + "mid_end_mixed_cafe", + string="café", + byte_index=0, + byte_count=4, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject length landing mid-character in mixed string", + ), + # Fractional double truncation causes mid-character end: "café", double(4.5) truncates to 4. + SubstrBytesTest( + "mid_end_double_trunc", + string="café", + byte_index=0, + byte_count=4.5, + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject truncated double length landing mid-character", + ), + # Decimal128 rounding causes mid-character end: "café", Decimal128("3.5") rounds to 4. + SubstrBytesTest( + "mid_end_decimal_round", + string="café", + byte_index=0, + byte_count=Decimal128("3.5"), + error_code=SUBSTR_MID_CHARACTER_END_ERROR, + msg="$substrBytes should reject rounded Decimal128 length landing mid-character", + ), +] + + +# Property [Error Precedence]: errors are evaluated in priority order: string type before index +# type before count type before double coercion before negative start before mid-char start before +# mid-char end. +SUBSTRBYTES_PRECEDENCE_TESTS: list[SubstrBytesTest] = [ + # String type error takes precedence over index type error. 
+ SubstrBytesTest( + "prec_string_over_index", + string=True, + byte_index="bad", + byte_count=3, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes string type error should precede index type error", + ), + # String type error takes precedence over count type error. + SubstrBytesTest( + "prec_string_over_count", + string=True, + byte_index=0, + byte_count="bad", + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes string type error should precede count type error", + ), + # String type error takes precedence when all three params are invalid. + SubstrBytesTest( + "prec_string_over_all", + string=True, + byte_index="bad", + byte_count="bad", + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes string type error should precede all other errors", + ), + # Index type error takes precedence over count type error. + SubstrBytesTest( + "prec_index_over_count", + string="hello", + byte_index="bad", + byte_count="bad", + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes index type error should precede count type error", + ), + # Index type error (null) takes precedence over negative count behavior. + SubstrBytesTest( + "prec_index_null_over_neg_count", + string="hello", + byte_index=None, + byte_count=-1, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes null index error should precede negative count behavior", + ), + # Count type error (null) takes precedence over negative start error. + SubstrBytesTest( + "prec_count_null_over_neg_start", + string="hello", + byte_index=-1, + byte_count=None, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes null count error should precede negative start error", + ), + # Double coercion error (NaN byte_index) is reported even with a negative byte_count. + # NOTE(review): no negative start is present in this case despite its id/msg; confirm intent. 
+ SubstrBytesTest( + "prec_double_coerce_over_neg_start", + string="hello", + byte_index=FLOAT_NAN, + byte_count=-1, + error_code=OUT_OF_RANGE_CONVERSION_ERROR, + msg="$substrBytes double coercion error should precede negative start error", + ), + # Negative start error takes precedence over mid-char start. + SubstrBytesTest( + "prec_neg_start_over_cont_start", + string="é", + byte_index=-1, + byte_count=1, + error_code=SUBSTR_NEGATIVE_START_ERROR, + msg="$substrBytes negative start error should precede continuation byte error", + ), + # Mid-char start error takes precedence over mid-char end. + SubstrBytesTest( + "prec_cont_start_over_mid_end", + string="éé", + byte_index=1, + byte_count=1, + error_code=SUBSTR_CONTINUATION_BYTE_START_ERROR, + msg="$substrBytes continuation byte error should precede mid-character end error", + ), + # Null-string-returns-empty only applies when index and count are valid. Count type error fires + # even when string is null. + SubstrBytesTest( + "prec_count_type_over_null_string", + string=None, + byte_index=0, + byte_count="bad", + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes count type error should fire even when string is null", + ), + # Index type error fires even when string is null. + SubstrBytesTest( + "prec_index_type_over_null_string", + string=None, + byte_index="bad", + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes index type error should fire even when string is null", + ), +] + +# Property [Arity Errors]: $substrBytes requires exactly 3 arguments in an array; fewer, more, or +# non-array shapes produce an error. 
SUBSTRBYTES_ARITY_ERROR_TESTS: list[SubstrBytesTest] = [
    # One case per malformed operand shape. Every row shares the same expected error code, so
    # only the id, the raw operand, and the wording of the failure message vary.
    SubstrBytesTest(
        case_id,
        raw_args=operand,
        error_code=EXPRESSION_TYPE_MISMATCH_ERROR,
        msg=f"$substrBytes should reject {shape}",
    )
    for case_id, operand, shape in (
        # Arrays with the wrong number of elements.
        ("arity_zero_args", [], "empty array"),
        ("arity_one_arg", ["hello"], "single-element array"),
        ("arity_two_args", ["hello", 0], "two-element array"),
        ("arity_four_args", ["hello", 0, 5, 1], "four-element array"),
        # Operands that are not arrays at all.
        ("arity_bare_string", "hello", "bare string argument"),
        ("arity_bare_object", {"a": 1}, "bare object argument"),
        ("arity_bare_number", 42, "bare number argument"),
        ("arity_bare_null", None, "bare null argument"),
    )
]

# Property [Field Path Parsing]: bare "$", "$$", and null-byte field paths produce parse-time
# errors in each parameter position.
SUBSTRBYTES_FIELD_PATH_ERROR_TESTS: list[SubstrBytesTest] = [
    # Cross product of parameter slot x malformed field path, ordered slot-major
    # (string, then byte_index, then byte_count) with the three paths inside each slot.
    SubstrBytesTest(
        f"fieldpath_{path_tag}_{slot_tag}",
        # Start from an otherwise valid call and swap the malformed path into one slot only.
        **{"string": "hello", "byte_index": 0, "byte_count": 1, slot: bad_path},
        error_code=code,
        msg=f"$substrBytes should reject {path_desc} as {slot_desc}",
    )
    for slot, slot_tag, slot_desc in (
        ("string", "string", "string parameter"),
        ("byte_index", "index", "byte_index"),
        ("byte_count", "count", "byte_count"),
    )
    for path_tag, bad_path, path_desc, code in (
        ("bare_dollar", "$", "bare '$'", INVALID_DOLLAR_FIELD_PATH),
        ("double_dollar", "$$", "'$$'", FAILED_TO_PARSE_ERROR),
        ("null_byte", "$a\x00b", "field path with null byte", FIELD_PATH_NULL_BYTE_ERROR),
    )
]
+SUBSTRBYTES_INVALID_ARGS_ALL_TESTS = ( + SUBSTRBYTES_FRAC_NEG_IDX_ERROR_TESTS + + SUBSTRBYTES_NEGATIVE_START_TESTS + + SUBSTRBYTES_DOUBLE_COERCION_ERROR_TESTS + + SUBSTRBYTES_CONTINUATION_START_TESTS + + SUBSTRBYTES_MID_CHAR_END_TESTS + + SUBSTRBYTES_PRECEDENCE_TESTS + + SUBSTRBYTES_ARITY_ERROR_TESTS + + SUBSTRBYTES_FIELD_PATH_ERROR_TESTS +) + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_INVALID_ARGS_ALL_TESTS)) +def test_substrbytes_invalid_args(collection, op, test_case: SubstrBytesTest): + """Run each invalid-argument case for every operator and assert its result or error code.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_null.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_null.py new file mode 100644 index 00000000..ab7d5521 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_null.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import MISSING + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Null and Missing in String Position]: when the string parameter is null, missing, or +# undefined, the result is an empty string "". 
+SUBSTRBYTES_NULL_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "null_string", + string=None, + byte_index=0, + byte_count=5, + expected="", + msg="$substrBytes should return empty string for null string parameter", + ), + SubstrBytesTest( + "missing_string", + string=MISSING, + byte_index=0, + byte_count=5, + expected="", + msg="$substrBytes should return empty string for missing string parameter", + ), + SubstrBytesTest( + "null_expression", + string={"$literal": None}, + byte_index=0, + byte_count=5, + expected="", + msg="$substrBytes should return empty string for null expression", + ), +] + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_NULL_TESTS)) +def test_substrbytes_null(collection, op, test_case: SubstrBytesTest): + """Run each null/missing case for every operator and assert its result or error code.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_numeric_coercion.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_numeric_coercion.py new file mode 100644 index 00000000..0b851f54 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_numeric_coercion.py @@ -0,0 +1,443 @@ +from __future__ import annotations + +import pytest +from bson import Decimal128, Int64 + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_HALF, + DECIMAL128_INFINITY, + DECIMAL128_LARGE_EXPONENT, + DECIMAL128_NEGATIVE_HALF, + DECIMAL128_NEGATIVE_INFINITY, 
+ DECIMAL128_NEGATIVE_ONE_AND_HALF, + DECIMAL128_NEGATIVE_ZERO, + DECIMAL128_ONE_AND_HALF, + DECIMAL128_SMALL_EXPONENT, + DECIMAL128_TWO_AND_HALF, + DOUBLE_MIN_NEGATIVE_SUBNORMAL, + DOUBLE_MIN_SUBNORMAL, + DOUBLE_NEGATIVE_ZERO, +) + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Numeric Coercion for Index and Count]: int32, int64, double, and Decimal128 are all +# accepted for byte_index and byte_count, with fractional doubles truncated toward zero and +# fractional Decimal128 values rounded using banker's rounding (round half to even). +SUBSTRBYTES_NUMERIC_COERCION_TESTS: list[SubstrBytesTest] = [ + # Each numeric type accepted for both parameters. + SubstrBytesTest( + "coerce_idx_int64", + string="hello", + byte_index=Int64(1), + byte_count=Int64(3), + expected="ell", + msg="$substrBytes should accept int64 for both parameters", + ), + SubstrBytesTest( + "coerce_idx_double", + string="hello", + byte_index=1.0, + byte_count=3.0, + expected="ell", + msg="$substrBytes should accept double for both parameters", + ), + SubstrBytesTest( + "coerce_idx_decimal", + string="hello", + byte_index=Decimal128("1"), + byte_count=Decimal128("3"), + expected="ell", + msg="$substrBytes should accept Decimal128 for both parameters", + ), + # Mixed numeric types across parameters. 
+ SubstrBytesTest( + "coerce_mixed_int32_decimal", + string="hello", + byte_index=1, + byte_count=Decimal128("3"), + expected="ell", + msg="$substrBytes should accept mixed int32 start and Decimal128 length", + ), + SubstrBytesTest( + "coerce_mixed_int64_double", + string="hello", + byte_index=Int64(1), + byte_count=2.0, + expected="el", + msg="$substrBytes should accept mixed int64 start and double length", + ), + SubstrBytesTest( + "coerce_mixed_double_int64", + string="hello", + byte_index=1.0, + byte_count=Int64(3), + expected="ell", + msg="$substrBytes should accept mixed double start and int64 length", + ), + SubstrBytesTest( + "coerce_mixed_decimal_int32", + string="hello", + byte_index=Decimal128("1"), + byte_count=3, + expected="ell", + msg="$substrBytes should accept mixed Decimal128 start and int32 length", + ), + # Fractional double truncated toward zero (C-style cast). + SubstrBytesTest( + "coerce_double_trunc_start", + string="hello", + byte_index=1.9, + byte_count=3, + expected="ell", + msg="$substrBytes should truncate fractional double start toward zero", + ), + SubstrBytesTest( + "coerce_double_trunc_start_half", + string="hello", + byte_index=0.5, + byte_count=5, + expected="hello", + msg="$substrBytes should truncate 0.5 start to 0", + ), + SubstrBytesTest( + "coerce_double_trunc_length", + string="hello", + byte_index=0, + byte_count=1.9, + expected="h", + msg="$substrBytes should truncate fractional double length toward zero", + ), + SubstrBytesTest( + "coerce_double_trunc_length_half", + string="hello", + byte_index=0, + byte_count=0.5, + expected="", + msg="$substrBytes should truncate 0.5 length to 0", + ), + # Fractional Decimal128 uses banker's rounding (round half to even). 
+ SubstrBytesTest( + "coerce_decimal_round_start_0_5", + string="hello", + byte_index=DECIMAL128_HALF, + byte_count=5, + expected="hello", + msg="$substrBytes should round Decimal128 0.5 start to 0 (even)", + ), + SubstrBytesTest( + "coerce_decimal_round_start_1_5", + string="hello", + byte_index=DECIMAL128_ONE_AND_HALF, + byte_count=3, + expected="llo", + msg="$substrBytes should round Decimal128 1.5 start to 2 (even)", + ), + SubstrBytesTest( + "coerce_decimal_round_start_2_5", + string="hello", + byte_index=DECIMAL128_TWO_AND_HALF, + byte_count=3, + expected="llo", + msg="$substrBytes should round Decimal128 2.5 start to 2 (even)", + ), + SubstrBytesTest( + "coerce_decimal_round_length_0_5", + string="hello", + byte_index=0, + byte_count=DECIMAL128_HALF, + expected="", + msg="$substrBytes should round Decimal128 0.5 length to 0 (even)", + ), + SubstrBytesTest( + "coerce_decimal_round_length_1_5", + string="hello", + byte_index=0, + byte_count=DECIMAL128_ONE_AND_HALF, + expected="he", + msg="$substrBytes should round Decimal128 1.5 length to 2 (even)", + ), + SubstrBytesTest( + "coerce_decimal_round_length_2_5", + string="hello", + byte_index=0, + byte_count=DECIMAL128_TWO_AND_HALF, + expected="he", + msg="$substrBytes should round Decimal128 2.5 length to 2 (even)", + ), + # Double -0.0 treated as 0. + SubstrBytesTest( + "coerce_double_neg_zero_start", + string="hello", + byte_index=DOUBLE_NEGATIVE_ZERO, + byte_count=3, + expected="hel", + msg="$substrBytes should treat double -0.0 start as 0", + ), + SubstrBytesTest( + "coerce_double_neg_zero_length", + string="hello", + byte_index=0, + byte_count=DOUBLE_NEGATIVE_ZERO, + expected="", + msg="$substrBytes should treat double -0.0 length as 0", + ), + # Decimal128 "-0" treated as 0. 
+ SubstrBytesTest( + "coerce_decimal_neg_zero_start", + string="hello", + byte_index=DECIMAL128_NEGATIVE_ZERO, + byte_count=3, + expected="hel", + msg="$substrBytes should treat Decimal128 -0 start as 0", + ), + SubstrBytesTest( + "coerce_decimal_neg_zero_length", + string="hello", + byte_index=0, + byte_count=DECIMAL128_NEGATIVE_ZERO, + expected="", + msg="$substrBytes should treat Decimal128 -0 length as 0", + ), + # Decimal128 trailing zeros and scientific notation resolve correctly. + SubstrBytesTest( + "coerce_decimal_trailing_zeros", + string="hello", + byte_index=Decimal128("3.00"), + byte_count=2, + expected="lo", + msg="$substrBytes should resolve Decimal128 with trailing zeros", + ), + SubstrBytesTest( + "coerce_decimal_sci_notation", + string="hello", + byte_index=Decimal128("3E0"), + byte_count=2, + expected="lo", + msg="$substrBytes should resolve Decimal128 in scientific notation", + ), + SubstrBytesTest( + "coerce_decimal_sci_neg_exp", + string="hello", + byte_index=Decimal128("30E-1"), + byte_count=2, + expected="lo", + msg="$substrBytes should resolve Decimal128 with negative exponent", + ), + # Tiny Decimal128 rounds to 0. + SubstrBytesTest( + "coerce_decimal_tiny_start", + string="hello", + byte_index=DECIMAL128_SMALL_EXPONENT, + byte_count=5, + expected="hello", + msg="$substrBytes should round tiny Decimal128 start to 0", + ), + SubstrBytesTest( + "coerce_decimal_tiny_length", + string="hello", + byte_index=0, + byte_count=DECIMAL128_SMALL_EXPONENT, + expected="", + msg="$substrBytes should round tiny Decimal128 length to 0", + ), + # Double min subnormal and negative min subnormal truncate to 0. 
+ SubstrBytesTest( + "coerce_double_subnormal_start", + string="hello", + byte_index=DOUBLE_MIN_SUBNORMAL, + byte_count=5, + expected="hello", + msg="$substrBytes should truncate subnormal double start to 0", + ), + SubstrBytesTest( + "coerce_double_neg_subnormal_start", + string="hello", + byte_index=DOUBLE_MIN_NEGATIVE_SUBNORMAL, + byte_count=5, + expected="hello", + msg="$substrBytes should truncate negative subnormal double start to 0", + ), + SubstrBytesTest( + "coerce_double_subnormal_length", + string="hello", + byte_index=0, + byte_count=DOUBLE_MIN_SUBNORMAL, + expected="", + msg="$substrBytes should truncate subnormal double length to 0", + ), +] + +# Property [Fractional Negative Boundary for Index - Success]: fractional doubles between -1 +# exclusive and 0 exclusive truncate to 0 and succeed as byte_index, and Decimal128 "-0.5" rounds +# to 0 (even) and succeeds. +SUBSTRBYTES_FRAC_NEG_IDX_SUCCESS_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "frac_neg_idx_double_minus_0_5", + string="hello", + byte_index=-0.5, + byte_count=5, + expected="hello", + msg="$substrBytes should truncate -0.5 start to 0", + ), + SubstrBytesTest( + "frac_neg_idx_double_minus_0_9", + string="hello", + byte_index=-0.9, + byte_count=5, + expected="hello", + msg="$substrBytes should truncate -0.9 start to 0", + ), + SubstrBytesTest( + "frac_neg_idx_decimal_minus_0_5", + string="hello", + byte_index=DECIMAL128_NEGATIVE_HALF, + byte_count=5, + expected="hello", + msg="$substrBytes should round Decimal128 -0.5 start to 0 (even)", + ), +] + +# Property [Fractional Negative Boundary for Count - Success]: fractional doubles between -1 +# exclusive and 0 exclusive truncate to 0 and produce an empty string, and Decimal128 "-0.5" rounds +# to 0 (even) and produces an empty string. 
SUBSTRBYTES_FRAC_NEG_CNT_SUCCESS_TESTS: list[SubstrBytesTest] = [
    # Every row extracts from "hello" at index 0 and expects the empty string; only the
    # byte_count value and the failure message differ.
    SubstrBytesTest(
        case_id,
        string="hello",
        byte_index=0,
        byte_count=count,
        expected="",
        msg=msg,
    )
    for case_id, count, msg in (
        (
            "frac_neg_cnt_double_minus_0_5",
            -0.5,
            "$substrBytes should truncate -0.5 length to 0",
        ),
        (
            "frac_neg_cnt_double_minus_0_9",
            -0.9,
            "$substrBytes should truncate -0.9 length to 0",
        ),
        (
            "frac_neg_cnt_decimal_minus_0_5",
            DECIMAL128_NEGATIVE_HALF,
            "$substrBytes should round Decimal128 -0.5 length to 0 (even)",
        ),
    )
]

# Property [Fractional Negative Boundary for Count - Negative]: fractional values that round or
# truncate to -1 or below behave as negative byte_count and return the rest of the string.
SUBSTRBYTES_FRAC_NEG_CNT_NEG_TESTS: list[SubstrBytesTest] = [
    # Same call shape as above, but these counts are expected to return the whole of "hello".
    SubstrBytesTest(
        case_id,
        string="hello",
        byte_index=0,
        byte_count=count,
        expected="hello",
        msg=msg,
    )
    for case_id, count, msg in (
        (
            "frac_neg_cnt_double_minus_1_0",
            -1.0,
            "$substrBytes should treat -1.0 length as negative and return rest of string",
        ),
        (
            "frac_neg_cnt_decimal_minus_1_5",
            DECIMAL128_NEGATIVE_ONE_AND_HALF,
            "$substrBytes should round Decimal128 -1.5 length to -2 and return rest of string",
        ),
    )
]

# Property [Decimal128 Special Values in byte_count]: Decimal128 NaN, Infinity, -Infinity, and
# values that overflow int64 range coerce to int64 min for byte_count, which is negative, so they
# return the rest of the string from the start position.
+SUBSTRBYTES_DECIMAL128_SPECIAL_COUNT_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "dec_special_cnt_nan", + string="hello", + byte_index=0, + byte_count=Decimal128("NaN"), + expected="hello", + msg="$substrBytes should treat Decimal128 NaN length as negative", + ), + SubstrBytesTest( + "dec_special_cnt_inf", + string="hello", + byte_index=0, + byte_count=DECIMAL128_INFINITY, + expected="hello", + msg="$substrBytes should treat Decimal128 Infinity length as negative", + ), + SubstrBytesTest( + "dec_special_cnt_neg_inf", + string="hello", + byte_index=0, + byte_count=DECIMAL128_NEGATIVE_INFINITY, + expected="hello", + msg="$substrBytes should treat Decimal128 -Infinity length as negative", + ), + SubstrBytesTest( + "dec_special_cnt_nan_from_middle", + string="hello", + byte_index=2, + byte_count=Decimal128("NaN"), + expected="llo", + msg="$substrBytes should return rest from middle with Decimal128 NaN length", + ), + SubstrBytesTest( + "dec_special_cnt_overflow_1e6144", + string="hello", + byte_index=0, + byte_count=DECIMAL128_LARGE_EXPONENT, + expected="hello", + msg="$substrBytes should treat Decimal128 1E+6144 length as negative overflow", + ), + SubstrBytesTest( + "dec_special_cnt_overflow_max", + string="hello", + byte_index=0, + byte_count=Decimal128("9999999999999999999999999999999999E+6111"), + expected="hello", + msg="$substrBytes should treat Decimal128 max value length as negative overflow", + ), + SubstrBytesTest( + "dec_special_cnt_overflow_34_digit", + string="hello", + byte_index=0, + byte_count=Decimal128("9999999999999999999999999999999999"), + expected="hello", + msg="$substrBytes should treat Decimal128 34-digit length as negative overflow", + ), +] + + +SUBSTRBYTES_NUMERIC_COERCION_ALL_TESTS = ( + SUBSTRBYTES_NUMERIC_COERCION_TESTS + + SUBSTRBYTES_FRAC_NEG_IDX_SUCCESS_TESTS + + SUBSTRBYTES_FRAC_NEG_CNT_SUCCESS_TESTS + + SUBSTRBYTES_FRAC_NEG_CNT_NEG_TESTS + + SUBSTRBYTES_DECIMAL128_SPECIAL_COUNT_TESTS +) + + +@pytest.mark.parametrize("op", 
OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_NUMERIC_COERCION_ALL_TESTS)) +def test_substrbytes_numeric_coercion(collection, op, test_case: SubstrBytesTest): + """Run each numeric-coercion case for every operator and assert its result or error code.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_size_limit.py new file mode 100644 index 00000000..14673e5b --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_size_limit.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [String Size Limit Success]: input strings just under the size limit are accepted. 
+SUBSTRBYTES_SIZE_LIMIT_SUCCESS_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "size_one_under_first", + string="a" * (STRING_SIZE_LIMIT_BYTES - 1), + byte_index=0, + byte_count=1, + expected="a", + msg="$substrBytes should extract first byte from string one byte under the 16 MB limit", + ), + SubstrBytesTest( + "size_one_under_full", + string="a" * (STRING_SIZE_LIMIT_BYTES - 1), + byte_index=0, + byte_count=STRING_SIZE_LIMIT_BYTES - 1, + expected="a" * (STRING_SIZE_LIMIT_BYTES - 1), + msg="$substrBytes should extract full string one byte under the 16 MB limit", + ), + SubstrBytesTest( + "size_one_under_last", + string="a" * (STRING_SIZE_LIMIT_BYTES - 1), + byte_index=STRING_SIZE_LIMIT_BYTES - 2, + byte_count=1, + expected="a", + msg="$substrBytes should extract last byte from string one byte under the 16 MB limit", + ), +] + + +# Property [String Size Limit]: input strings at or above the size limit produce an error. +SUBSTRBYTES_SIZE_LIMIT_ERROR_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "size_at_limit", + string="a" * STRING_SIZE_LIMIT_BYTES, + byte_index=0, + byte_count=1, + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$substrBytes should reject input string at the 16 MB byte limit", + ), + # 2-byte chars: STRING_SIZE_LIMIT_BYTES // 2 of them total the limit in bytes (for an even + # limit) with only half as many characters. 
+ SubstrBytesTest( + "size_multibyte_at_limit", + string="é" * (STRING_SIZE_LIMIT_BYTES // 2), + byte_index=0, + byte_count=1, + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$substrBytes should reject multi-byte string exceeding 16 MB in bytes", + ), +] + + +SUBSTRBYTES_SIZE_LIMIT_ALL_TESTS = ( + SUBSTRBYTES_SIZE_LIMIT_SUCCESS_TESTS + SUBSTRBYTES_SIZE_LIMIT_ERROR_TESTS +) + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_SIZE_LIMIT_ALL_TESTS)) +def test_substrbytes_size_limit(collection, op, test_case: SubstrBytesTest): + """Run each size-limit case for every operator and assert its result or error code.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_string_coercion.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_string_coercion.py new file mode 100644 index 00000000..7a28244b --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_string_coercion.py @@ -0,0 +1,309 @@ +from __future__ import annotations + +import pytest +from bson import Code, Decimal128, Int64, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_MAX_NEGATIVE, + DECIMAL128_MIN_POSITIVE, + DECIMAL128_NEGATIVE_INFINITY, + DECIMAL128_TRAILING_ZERO, + DOUBLE_MAX_SAFE_INTEGER, + DOUBLE_NEGATIVE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT64_MAX, + INT64_MIN, +) + +from .utils.substrBytes_common import ( + OPERATORS, + 
SubstrBytesTest, + _expr, +) + +# Property [String Parameter Coercion]: non-string types for the string parameter are coerced to +# their string representation before extraction. +SUBSTRBYTES_COERCION_TESTS: list[SubstrBytesTest] = [ + # int32 and int64 coerce to decimal string representation. + SubstrBytesTest( + "coerce_int32", + string=42, + byte_index=0, + byte_count=-1, + expected="42", + msg="$substrBytes should coerce int32 to string", + ), + SubstrBytesTest( + "coerce_int32_negative", + string=-42, + byte_index=0, + byte_count=-1, + expected="-42", + msg="$substrBytes should coerce negative int32 to string", + ), + SubstrBytesTest( + "coerce_int32_zero", + string=0, + byte_index=0, + byte_count=-1, + expected="0", + msg="$substrBytes should coerce int32 zero to string", + ), + SubstrBytesTest( + "coerce_int64", + string=Int64(42), + byte_index=0, + byte_count=-1, + expected="42", + msg="$substrBytes should coerce int64 to string", + ), + SubstrBytesTest( + "coerce_int64_max", + string=INT64_MAX, + byte_index=0, + byte_count=-1, + expected="9223372036854775807", + msg="$substrBytes should coerce INT64_MAX to string", + ), + SubstrBytesTest( + "coerce_int64_min", + string=INT64_MIN, + byte_index=0, + byte_count=-1, + expected="-9223372036854775808", + msg="$substrBytes should coerce INT64_MIN to string", + ), + # double whole numbers omit trailing .0. + SubstrBytesTest( + "coerce_double_whole", + string=3.0, + byte_index=0, + byte_count=-1, + expected="3", + msg="$substrBytes should coerce whole double without trailing .0", + ), + SubstrBytesTest( + "coerce_double_whole_100", + string=100.0, + byte_index=0, + byte_count=-1, + expected="100", + msg="$substrBytes should coerce double 100 without trailing .0", + ), + # double with fractional part. + SubstrBytesTest( + "coerce_double_fractional", + string=3.14, + byte_index=0, + byte_count=-1, + expected="3.14", + msg="$substrBytes should coerce fractional double to string", + ), + # double negative zero. 
+ SubstrBytesTest( + "coerce_double_neg_zero", + string=DOUBLE_NEGATIVE_ZERO, + byte_index=0, + byte_count=-1, + expected="-0", + msg="$substrBytes should coerce double negative zero", + ), + # double special values are lowercase. + SubstrBytesTest( + "coerce_double_nan", + string=FLOAT_NAN, + byte_index=0, + byte_count=-1, + expected="nan", + msg="$substrBytes should coerce double NaN to lowercase nan", + ), + SubstrBytesTest( + "coerce_double_inf", + string=FLOAT_INFINITY, + byte_index=0, + byte_count=-1, + expected="inf", + msg="$substrBytes should coerce double Infinity to lowercase inf", + ), + SubstrBytesTest( + "coerce_double_neg_inf", + string=FLOAT_NEGATIVE_INFINITY, + byte_index=0, + byte_count=-1, + expected="-inf", + msg="$substrBytes should coerce double -Infinity to lowercase -inf", + ), + # double uses scientific notation for large and small magnitudes. + SubstrBytesTest( + "coerce_double_sci_large", + string=1e6, + byte_index=0, + byte_count=-1, + expected="1e+06", + msg="$substrBytes should coerce large double to scientific notation", + ), + SubstrBytesTest( + "coerce_double_sci_small", + string=1e-10, + byte_index=0, + byte_count=-1, + expected="1e-10", + msg="$substrBytes should coerce small double to scientific notation", + ), + SubstrBytesTest( + "coerce_double_precision_limit", + string=float(DOUBLE_MAX_SAFE_INTEGER), + byte_index=0, + byte_count=-1, + expected="9.0072e+15", + msg="$substrBytes should coerce double at precision limit", + ), + # Decimal128 preserves exact string representation including trailing zeros. 
+ SubstrBytesTest( + "coerce_decimal_trailing", + string=DECIMAL128_TRAILING_ZERO, + byte_index=0, + byte_count=-1, + expected="1.0", + msg="$substrBytes should preserve Decimal128 trailing zeros", + ), + SubstrBytesTest( + "coerce_decimal_neg_zero", + string=Decimal128("-0.0"), + byte_index=0, + byte_count=-1, + expected="-0.0", + msg="$substrBytes should preserve Decimal128 negative zero", + ), + SubstrBytesTest( + "coerce_decimal_sci", + string=Decimal128("1.23E+10"), + byte_index=0, + byte_count=-1, + expected="1.23E+10", + msg="$substrBytes should preserve Decimal128 scientific notation", + ), + # Decimal128 preserves full 34-digit precision. + SubstrBytesTest( + "coerce_decimal_34_digits", + string=Decimal128("1234567890123456789012345678901234"), + byte_index=0, + byte_count=-1, + expected="1234567890123456789012345678901234", + msg="$substrBytes should preserve Decimal128 34-digit precision", + ), + SubstrBytesTest( + "coerce_decimal_min_positive", + string=DECIMAL128_MIN_POSITIVE, + byte_index=0, + byte_count=-1, + expected="1E-6176", + msg="$substrBytes should coerce Decimal128 minimum positive value", + ), + SubstrBytesTest( + "coerce_decimal_max_negative", + string=DECIMAL128_MAX_NEGATIVE, + byte_index=0, + byte_count=-1, + expected="-1E-6176", + msg="$substrBytes should coerce Decimal128 maximum negative value", + ), + # Decimal128 special values are capitalized (unlike double's lowercase). 
+ SubstrBytesTest( + "coerce_decimal_nan", + string=Decimal128("NaN"), + byte_index=0, + byte_count=-1, + expected="NaN", + msg="$substrBytes should coerce Decimal128 NaN with capitalization", + ), + SubstrBytesTest( + "coerce_decimal_infinity", + string=DECIMAL128_INFINITY, + byte_index=0, + byte_count=-1, + expected="Infinity", + msg="$substrBytes should coerce Decimal128 Infinity with capitalization", + ), + SubstrBytesTest( + "coerce_decimal_neg_infinity", + string=DECIMAL128_NEGATIVE_INFINITY, + byte_index=0, + byte_count=-1, + expected="-Infinity", + msg="$substrBytes should coerce Decimal128 -Infinity with capitalization", + ), + # datetime coerces to ISO 8601 format. + SubstrBytesTest( + "coerce_datetime", + string={"$toDate": "2024-01-15T10:30:45.123Z"}, + byte_index=0, + byte_count=-1, + expected="2024-01-15T10:30:45.123Z", + msg="$substrBytes should coerce datetime to ISO 8601", + ), + SubstrBytesTest( + "coerce_datetime_pre_epoch", + string={"$toDate": "1969-07-20T20:17:40.000Z"}, + byte_index=0, + byte_count=-1, + expected="1969-07-20T20:17:40.000Z", + msg="$substrBytes should coerce pre-epoch datetime to ISO 8601", + ), + # Timestamp coerces to "Mon DD HH:MM:SS:increment" with double-space padding for single-digit + # days. + SubstrBytesTest( + "coerce_timestamp_single_digit_day", + string=Timestamp(1704067200, 1), + byte_index=0, + byte_count=-1, + expected="Jan 1 00:00:00:1", + msg="$substrBytes should coerce Timestamp with single-digit day", + ), + SubstrBytesTest( + "coerce_timestamp_double_digit_day", + string=Timestamp(1721500800, 1), + byte_index=0, + byte_count=-1, + expected="Jul 20 18:40:00:1", + msg="$substrBytes should coerce Timestamp with double-digit day", + ), + SubstrBytesTest( + "coerce_timestamp_increment", + string=Timestamp(1704067200, 42), + byte_index=0, + byte_count=-1, + expected="Jan 1 00:00:00:42", + msg="$substrBytes should coerce Timestamp preserving increment", + ), + # Code (without scope) coerces to its code string. 
+ SubstrBytesTest( + "coerce_code", + string=Code("function() { return 1; }"), + byte_index=0, + byte_count=-1, + expected="function() { return 1; }", + msg="$substrBytes should coerce Code to its code string", + ), +] + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_COERCION_TESTS)) +def test_substrbytes_string_coercion(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes string coercion cases.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_type_errors.py new file mode 100644 index 00000000..006f2f6a --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_type_errors.py @@ -0,0 +1,492 @@ +from __future__ import annotations + +import pytest +from bson import Binary, Code, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BSON_TO_STRING_CONVERSION_ERROR, + SUBSTR_LENGTH_TYPE_ERROR, + SUBSTR_START_TYPE_ERROR, +) +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import MISSING + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [String Parameter Type Strictness]: non-coercible types for the string parameter produce +# an error, including arrays, objects, and expressions evaluating to rejected types. 
+SUBSTRBYTES_STRING_TYPE_ERROR_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "string_type_bool", + string=True, + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject boolean string parameter", + ), + SubstrBytesTest( + "string_type_empty_array", + string=[], + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject empty array string parameter", + ), + SubstrBytesTest( + "string_type_single_array", + string=["a"], + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject single-element array string parameter", + ), + SubstrBytesTest( + "string_type_nested_array", + string=[["a"]], + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject nested array string parameter", + ), + SubstrBytesTest( + "string_type_array_with_null", + string=[None], + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject array with null string parameter", + ), + SubstrBytesTest( + "string_type_object", + string={"a": 1}, + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject object string parameter", + ), + SubstrBytesTest( + "string_type_objectid", + string=ObjectId(), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject ObjectId string parameter", + ), + SubstrBytesTest( + "string_type_binary", + string=Binary(b"data"), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject Binary string parameter", + ), + SubstrBytesTest( + "string_type_binary_uuid", + string=Binary(b"\x00" * 16, 4), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject Binary UUID string parameter", + ), + 
SubstrBytesTest( + "string_type_regex", + string=Regex("pattern"), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject Regex string parameter", + ), + SubstrBytesTest( + "string_type_code_with_scope", + string=Code("x", {"a": 1}), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject Code with scope string parameter", + ), + SubstrBytesTest( + "string_type_minkey", + string=MinKey(), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject MinKey string parameter", + ), + SubstrBytesTest( + "string_type_maxkey", + string=MaxKey(), + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject MaxKey string parameter", + ), + # Expressions evaluating to rejected types. + SubstrBytesTest( + "string_type_expr_bool", + string={"$gt": [1, 0]}, + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject expression evaluating to boolean", + ), + SubstrBytesTest( + "string_type_expr_array", + string={"$literal": ["a"]}, + byte_index=0, + byte_count=1, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrBytes should reject expression evaluating to array", + ), +] + +# Property [Index Type Strictness]: non-numeric types for byte_index produce an error, including +# expressions evaluating to non-numeric types. 
+SUBSTRBYTES_INDEX_TYPE_ERROR_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "index_type_null", + string="hello", + byte_index=None, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject null start index", + ), + SubstrBytesTest( + "index_type_missing", + string="hello", + byte_index=MISSING, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject missing start index", + ), + SubstrBytesTest( + "index_type_string", + string="hello", + byte_index="0", + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject string start index", + ), + SubstrBytesTest( + "index_type_bool", + string="hello", + byte_index=True, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject boolean start index", + ), + SubstrBytesTest( + "index_type_array", + string="hello", + byte_index=[], + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject array start index", + ), + SubstrBytesTest( + "index_type_object", + string="hello", + byte_index={"a": 1}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject object start index", + ), + SubstrBytesTest( + "index_type_objectid", + string="hello", + byte_index=ObjectId(), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject ObjectId start index", + ), + SubstrBytesTest( + "index_type_datetime", + string="hello", + byte_index={"$toDate": "2024-01-01T00:00:00Z"}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject datetime start index", + ), + SubstrBytesTest( + "index_type_timestamp", + string="hello", + byte_index=Timestamp(0, 1), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Timestamp start index", + ), + SubstrBytesTest( + "index_type_binary", + string="hello", + byte_index=Binary(b"data"), + byte_count=3, + 
error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Binary start index", + ), + SubstrBytesTest( + "index_type_binary_uuid", + string="hello", + byte_index=Binary(b"\x00" * 16, 4), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Binary UUID start index", + ), + SubstrBytesTest( + "index_type_regex", + string="hello", + byte_index=Regex("x"), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Regex start index", + ), + SubstrBytesTest( + "index_type_code", + string="hello", + byte_index=Code("x"), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Code start index", + ), + SubstrBytesTest( + "index_type_code_with_scope", + string="hello", + byte_index=Code("x", {"a": 1}), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject Code with scope start index", + ), + SubstrBytesTest( + "index_type_minkey", + string="hello", + byte_index=MinKey(), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject MinKey start index", + ), + SubstrBytesTest( + "index_type_maxkey", + string="hello", + byte_index=MaxKey(), + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject MaxKey start index", + ), + # Expressions evaluating to non-numeric types. 
+ SubstrBytesTest( + "index_type_expr_null", + string="hello", + byte_index={"$literal": None}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to null start", + ), + SubstrBytesTest( + "index_type_expr_string", + string="hello", + byte_index={"$concat": ["0"]}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to string start", + ), + SubstrBytesTest( + "index_type_expr_array", + string="hello", + byte_index={"$literal": [0]}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to array start", + ), + SubstrBytesTest( + "index_type_expr_cond_null", + string="hello", + byte_index={"$cond": [True, None, 0]}, + byte_count=3, + error_code=SUBSTR_START_TYPE_ERROR, + msg="$substrBytes should reject $cond evaluating to null start", + ), +] + +# Property [Count Type Strictness]: non-numeric types for byte_count produce an error, including +# expressions evaluating to non-numeric types. 
+SUBSTRBYTES_COUNT_TYPE_ERROR_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "count_type_null", + string="hello", + byte_index=0, + byte_count=None, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject null byte count", + ), + SubstrBytesTest( + "count_type_missing", + string="hello", + byte_index=0, + byte_count=MISSING, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject missing byte count", + ), + SubstrBytesTest( + "count_type_string", + string="hello", + byte_index=0, + byte_count="3", + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject string byte count", + ), + SubstrBytesTest( + "count_type_bool", + string="hello", + byte_index=0, + byte_count=True, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject boolean byte count", + ), + SubstrBytesTest( + "count_type_array", + string="hello", + byte_index=0, + byte_count=[], + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject array byte count", + ), + SubstrBytesTest( + "count_type_object", + string="hello", + byte_index=0, + byte_count={"a": 1}, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject object byte count", + ), + SubstrBytesTest( + "count_type_objectid", + string="hello", + byte_index=0, + byte_count=ObjectId(), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject ObjectId byte count", + ), + SubstrBytesTest( + "count_type_datetime", + string="hello", + byte_index=0, + byte_count={"$toDate": "2024-01-01T00:00:00Z"}, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject datetime byte count", + ), + SubstrBytesTest( + "count_type_timestamp", + string="hello", + byte_index=0, + byte_count=Timestamp(0, 1), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Timestamp byte count", + ), + SubstrBytesTest( + "count_type_binary", + string="hello", + byte_index=0, + byte_count=Binary(b"data"), + 
error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Binary byte count", + ), + SubstrBytesTest( + "count_type_binary_uuid", + string="hello", + byte_index=0, + byte_count=Binary(b"\x00" * 16, 4), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Binary UUID byte count", + ), + SubstrBytesTest( + "count_type_regex", + string="hello", + byte_index=0, + byte_count=Regex("x"), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Regex byte count", + ), + SubstrBytesTest( + "count_type_code", + string="hello", + byte_index=0, + byte_count=Code("x"), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Code byte count", + ), + SubstrBytesTest( + "count_type_code_with_scope", + string="hello", + byte_index=0, + byte_count=Code("x", {"a": 1}), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject Code with scope byte count", + ), + SubstrBytesTest( + "count_type_minkey", + string="hello", + byte_index=0, + byte_count=MinKey(), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject MinKey byte count", + ), + SubstrBytesTest( + "count_type_maxkey", + string="hello", + byte_index=0, + byte_count=MaxKey(), + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject MaxKey byte count", + ), + # Expressions evaluating to non-numeric types. 
+ SubstrBytesTest( + "count_type_expr_null", + string="hello", + byte_index=0, + byte_count={"$literal": None}, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to null count", + ), + SubstrBytesTest( + "count_type_expr_string", + string="hello", + byte_index=0, + byte_count={"$literal": "3"}, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to string count", + ), + SubstrBytesTest( + "count_type_expr_array", + string="hello", + byte_index=0, + byte_count={"$literal": [3]}, + error_code=SUBSTR_LENGTH_TYPE_ERROR, + msg="$substrBytes should reject expression evaluating to array count", + ), +] + + +SUBSTRBYTES_TYPE_ERROR_ALL_TESTS = ( + SUBSTRBYTES_STRING_TYPE_ERROR_TESTS + + SUBSTRBYTES_INDEX_TYPE_ERROR_TESTS + + SUBSTRBYTES_COUNT_TYPE_ERROR_TESTS +) + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_TYPE_ERROR_ALL_TESTS)) +def test_substrbytes_type_errors(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes string, index, and count type error cases.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_usage.py new file mode 100644 index 00000000..ff1aad81 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/test_substrBytes_usage.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_project_with_insert, +) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from
documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_TRAILING_ZERO, + MISSING, +) + +from .utils.substrBytes_common import ( + OPERATORS, + SubstrBytesTest, + _expr, +) + +# Property [Expression Arguments]: all three parameters accept expressions that resolve to valid +# types. +SUBSTRBYTES_EXPRESSION_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "expr_string", + string={"$concat": ["hel", "lo"]}, + byte_index=0, + byte_count=5, + expected="hello", + msg="$substrBytes should accept an expression for the string parameter", + ), + SubstrBytesTest( + "expr_start", + string="hello", + byte_index={"$add": [1, 1]}, + byte_count=3, + expected="llo", + msg="$substrBytes should accept an expression for the start parameter", + ), + SubstrBytesTest( + "expr_length", + string="hello", + byte_index=0, + byte_count={"$strLenBytes": "hel"}, + expected="hel", + msg="$substrBytes should accept an expression for the length parameter", + ), + SubstrBytesTest( + "nested_substrBytes", + string={"$substrBytes": ["hello world", 0, 5]}, + byte_index=1, + byte_count=3, + expected="ell", + msg="$substrBytes should compose with itself (substring of a substring)", + ), +] + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_EXPRESSION_TESTS)) +def test_substrbytes_usage(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes expression argument cases.""" + result = execute_expression(collection, _expr(test_case, op)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) + + +# Property [Document Field References]: $substrBytes works with field references from inserted +# documents.
+@pytest.mark.parametrize("op", OPERATORS) +def test_substrbytes_document_fields(collection, op): + """Test $substrBytes reads values from document fields.""" + result = execute_project_with_insert( + collection, + {"s": "hello world", "i": 6, "n": 5}, + {"result": {op: ["$s", "$i", "$n"]}}, + ) + assertSuccess( + result, [{"result": "world"}], msg="$substrBytes should read values from document fields" + ) + + +# Property [Return Type]: the result is always type "string" when the expression succeeds, +# including when the string parameter is null, missing, or coerced from a non-string type. +SUBSTRBYTES_RETURN_TYPE_TESTS: list[SubstrBytesTest] = [ + SubstrBytesTest( + "return_type_normal", + string="hello", + byte_index=0, + byte_count=5, + msg="$substrBytes should return type string for normal input", + ), + SubstrBytesTest( + "return_type_null", + string=None, + byte_index=0, + byte_count=5, + msg="$substrBytes should return type string for null input", + ), + SubstrBytesTest( + "return_type_missing", + string=MISSING, + byte_index=0, + byte_count=5, + msg="$substrBytes should return type string for missing input", + ), + SubstrBytesTest( + "return_type_coerced_int", + string=42, + byte_index=0, + byte_count=2, + msg="$substrBytes should return type string for coerced int", + ), + SubstrBytesTest( + "return_type_coerced_double", + string=3.14, + byte_index=0, + byte_count=4, + msg="$substrBytes should return type string for coerced double", + ), + SubstrBytesTest( + "return_type_coerced_decimal", + string=DECIMAL128_TRAILING_ZERO, + byte_index=0, + byte_count=3, + msg="$substrBytes should return type string for coerced Decimal128", + ), + SubstrBytesTest( + "return_type_empty", + string="", + byte_index=0, + byte_count=0, + msg="$substrBytes should return type string for empty input", + ), +] + + +@pytest.mark.parametrize("op", OPERATORS) +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRBYTES_RETURN_TYPE_TESTS)) +def 
test_substrbytes_return_type(collection, op, test_case: SubstrBytesTest): + """Test $substrBytes result is always type string.""" + result = execute_expression( + collection, + {"$type": {op: [test_case.string, test_case.byte_index, test_case.byte_count]}}, + ) + assertSuccess(result, [{"result": "string"}], msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/utils/substrBytes_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/utils/substrBytes_common.py new file mode 100644 index 00000000..be54114a --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrBytes/utils/substrBytes_common.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +import pytest + +from documentdb_tests.framework.test_case import BaseTestCase + +from ...substr.test_operator_substr import ( + SUBSTR_OPERATOR, +) + +# Sentinel for "omit this parameter from the expression." Distinct from None +# (which means pass null). +_OMIT = object() + + +@dataclass(frozen=True) +class SubstrBytesTest(BaseTestCase): + """Test case for $substrBytes operator.""" + + string: Any = None + byte_index: Any = None + byte_count: Any = None + raw_args: Any = _OMIT # Raw operator argument override for arity tests. + + +# $substr is a deprecated alias for $substrBytes; both are tested here. 
+OPERATORS = [ + pytest.param("$substrBytes", id="substrBytes"), + SUBSTR_OPERATOR, +] + + +def _expr(test_case: SubstrBytesTest, op: str) -> dict: + if test_case.raw_args is not _OMIT: + return {op: test_case.raw_args} + return {op: [test_case.string, test_case.byte_index, test_case.byte_count]} diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_core.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_core.py new file mode 100644 index 00000000..531ee9ce --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_core.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import INT32_MAX + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Core Substring]: extraction uses zero-based code point indexing with count specifying +# the number of code points to extract. 
+SUBSTRCP_CORE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "core_basic_ascii", + string="abcde", + index=1, + count=2, + expected="bc", + msg="$substrCP should extract from zero-based code point index", + ), + SubstrCPTest( + "core_digit_string", + string="12345", + index=0, + count=3, + expected="123", + msg="$substrCP should treat digit characters as a string, not a number", + ), + SubstrCPTest( + "core_expression_params", + string="hello", + index={"$add": [0, 1]}, + count={"$subtract": [3, 1]}, + expected="el", + msg="$substrCP should accept expressions for index and count", + ), + SubstrCPTest( + "core_expression_string", + string={"$concat": ["hel", "lo"]}, + index=1, + count=3, + expected="ell", + msg="$substrCP should accept expression for string parameter", + ), + SubstrCPTest( + "core_dollar_literal", + string={"$literal": "$hello"}, + index=0, + count=6, + expected="$hello", + msg="$substrCP should handle dollar-prefixed string via $literal", + ), +] + +# Property [Boundary Clamping]: when index or count exceeds the string length, the result is +# clamped without error. 
+SUBSTRCP_BOUNDARY_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "boundary_count_exceeds", + string="hello", + index=3, + count=10, + expected="lo", + msg="$substrCP should clamp when index + count exceeds string length", + ), + SubstrCPTest( + "boundary_index_at_length", + string="hello", + index=5, + count=1, + expected="", + msg="$substrCP should return empty string when index equals string length", + ), + SubstrCPTest( + "boundary_int32_max_index", + string="hello", + index=INT32_MAX, + count=1, + expected="", + msg="$substrCP should return empty string for INT32_MAX index on short string", + ), + SubstrCPTest( + "boundary_int32_max_count", + string="hello", + index=0, + count=INT32_MAX, + expected="hello", + msg="$substrCP should clamp INT32_MAX count to remaining characters", + ), + SubstrCPTest( + "boundary_both_int32_max", + string="hello", + index=INT32_MAX, + count=INT32_MAX, + expected="", + msg="$substrCP should handle both index and count at INT32_MAX without overflow", + ), + SubstrCPTest( + "boundary_count_zero", + string="hello", + index=2, + count=0, + expected="", + msg="$substrCP should return empty string when count is 0", + ), + SubstrCPTest( + "boundary_empty_string", + string="", + index=0, + count=0, + expected="", + msg="$substrCP should return empty string for empty input with index 0 count 0", + ), + SubstrCPTest( + "boundary_empty_string_count_1", + string="", + index=0, + count=1, + expected="", + msg="$substrCP should return empty string for empty input with count > 0", + ), +] + + +SUBSTRCP_CORE_ALL_TESTS = SUBSTRCP_CORE_TESTS + SUBSTRCP_BOUNDARY_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_CORE_ALL_TESTS)) +def test_substrcp_core(collection, test_case: SubstrCPTest): + """Test $substrCP core substring and boundary cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff 
--git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_encoding.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_encoding.py new file mode 100644 index 00000000..2d4bee95 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_encoding.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Encoding]: multi-byte UTF-8 characters, combining marks, zero-width characters, and +# special code points are each counted as one code point. +SUBSTRCP_ENCODING_TESTS: list[SubstrCPTest] = [ + # U+00E9 é is 2 bytes in UTF-8 but one code point. + SubstrCPTest( + "encoding_2byte", + string="café", + index=3, + count=1, + expected="é", + msg="$substrCP should count 2-byte UTF-8 character as one code point", + ), + # CJK characters are 3 bytes in UTF-8. + SubstrCPTest( + "encoding_3byte", + string="日本語", + index=1, + count=1, + expected="本", + msg="$substrCP should count 3-byte CJK character as one code point", + ), + # Supplementary-plane emoji (🎉, 🚀) are 4 bytes in UTF-8; ✨ (U+2728) is 3 bytes. + SubstrCPTest( + "encoding_4byte", + string="🎉🚀✨", + index=1, + count=1, + expected="🚀", + msg="$substrCP should count 4-byte emoji as one code point", + ), + # Precomposed U+00E9 is a single code point, unlike decomposed e + U+0301. + SubstrCPTest( + "encoding_precomposed", + string="\u00e9X", + index=0, + count=1, + expected="\u00e9", + msg="$substrCP should treat precomposed character as one code point", + ), + # U+FEFF BOM is one code point.
+ SubstrCPTest( + "encoding_bom", + string="\ufeffhello", + index=0, + count=1, + expected="\ufeff", + msg="$substrCP should count BOM as one code point", + ), + # Embedded null byte U+0000 is one code point. + SubstrCPTest( + "encoding_null_byte", + string="a\x00b", + index=1, + count=1, + expected="\x00", + msg="$substrCP should count embedded null byte as one code point", + ), + # Mixed whitespace: space, tab, newline, carriage return, space. + SubstrCPTest( + "encoding_whitespace_mix", + string=" \t\n\r ", + index=2, + count=1, + expected="\n", + msg="$substrCP should correctly index across different whitespace characters", + ), + # Control character U+0001. + SubstrCPTest( + "encoding_control_char", + string="\x01\x1f", + index=0, + count=1, + expected="\x01", + msg="$substrCP should count control characters as one code point each", + ), + # U+200B Zero-Width Space is one code point. + SubstrCPTest( + "encoding_zwsp", + string="a\u200bb", + index=1, + count=1, + expected="\u200b", + msg="$substrCP should count ZWSP as one code point", + ), + # U+200F Right-to-Left Mark is one code point. + SubstrCPTest( + "encoding_rtl_mark", + string="a\u200fb", + index=1, + count=1, + expected="\u200f", + msg="$substrCP should count directional mark as one code point", + ), + # U+00A0 NBSP, U+2002 en space, U+2003 em space are each one code point. + SubstrCPTest( + "encoding_nbsp", + string="\u00a0X", + index=0, + count=1, + expected="\u00a0", + msg="$substrCP should count NBSP as one code point", + ), + SubstrCPTest( + "encoding_en_em_space", + string="\u2002\u2003", + index=1, + count=1, + expected="\u2003", + msg="$substrCP should count en/em space as one code point each", + ), + # U+D7FF (last code point before surrogate range). + SubstrCPTest( + "encoding_boundary_d7ff", + string="\ud7ff", + index=0, + count=1, + expected="\ud7ff", + msg="$substrCP should handle U+D7FF correctly", + ), + # U+E000 (first private use area character). 
+ SubstrCPTest( + "encoding_boundary_e000", + string="\ue000", + index=0, + count=1, + expected="\ue000", + msg="$substrCP should handle U+E000 correctly", + ), + # U+FFFF (last BMP character). + SubstrCPTest( + "encoding_boundary_ffff", + string="\uffff", + index=0, + count=1, + expected="\uffff", + msg="$substrCP should handle U+FFFF correctly", + ), + # U+10000 (first supplementary plane character). + SubstrCPTest( + "encoding_boundary_10000", + string="\U00010000", + index=0, + count=1, + expected="\U00010000", + msg="$substrCP should handle U+10000 correctly", + ), + # U+10FFFF (last valid Unicode code point). + SubstrCPTest( + "encoding_boundary_10ffff", + string="\U0010ffff", + index=0, + count=1, + expected="\U0010ffff", + msg="$substrCP should handle U+10FFFF correctly", + ), + SubstrCPTest( + "encoding_json_chars", + string='{"key": [1]}', + index=0, + count=5, + expected='{"key', + msg="$substrCP should treat JSON/BSON special characters as regular characters", + ), + SubstrCPTest( + "encoding_backslash", + string="a\\b\\c", + index=1, + count=3, + expected="\\b\\", + msg="$substrCP should treat backslash as a regular character", + ), +] + +# Property [Grapheme Splitting]: the operator splits at code point boundaries, not grapheme cluster +# boundaries, so combining marks and ZWJ emoji components are extracted independently. +SUBSTRCP_GRAPHEME_SPLIT_TESTS: list[SubstrCPTest] = [ + # Decomposed e + U+0301 combining acute accent = 2 code points. + SubstrCPTest( + "grapheme_base_without_combining", + string="e\u0301", + index=0, + count=1, + expected="e", + msg="$substrCP should extract base character without combining mark", + ), + SubstrCPTest( + "grapheme_combining_mark_alone", + string="e\u0301", + index=1, + count=1, + expected="\u0301", + msg="$substrCP should extract combining mark alone", + ), + # ZWJ family emoji: 👨 + ZWJ + 👩 + ZWJ + 👧 + ZWJ + 👦 = 7 code points. 
+ SubstrCPTest( + "grapheme_zwj_emoji_first", + string="👨\u200d👩\u200d👧\u200d👦", + index=0, + count=1, + expected="👨", + msg="$substrCP should extract first emoji from ZWJ sequence", + ), + SubstrCPTest( + "grapheme_zwj_emoji_joiner", + string="👨\u200d👩\u200d👧\u200d👦", + index=1, + count=1, + expected="\u200d", + msg="$substrCP should extract ZWJ joiner from emoji sequence", + ), + SubstrCPTest( + "grapheme_zwj_emoji_partial", + string="👨\u200d👩\u200d👧\u200d👦", + index=0, + count=3, + expected="👨\u200d👩", + msg="$substrCP should split ZWJ emoji sequence at code point boundaries", + ), +] + + +SUBSTRCP_ENCODING_ALL_TESTS = SUBSTRCP_ENCODING_TESTS + SUBSTRCP_GRAPHEME_SPLIT_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_ENCODING_ALL_TESTS)) +def test_substrcp_encoding(collection, test_case: SubstrCPTest): + """Test $substrCP encoding and grapheme splitting cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_invalid_args.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_invalid_args.py new file mode 100644 index 00000000..bb6ac3fe --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_invalid_args.py @@ -0,0 +1,530 @@ +from __future__ import annotations + +import pytest +from bson import Decimal128, Int64 + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BSON_TO_STRING_CONVERSION_ERROR, + EXPRESSION_TYPE_MISMATCH_ERROR, + FAILED_TO_PARSE_ERROR, + FIELD_PATH_NULL_BYTE_ERROR, + INVALID_DOLLAR_FIELD_PATH, + 
SUBSTRCP_COUNT_NEGATIVE_ERROR, + SUBSTRCP_COUNT_NON_INT_ERROR, + SUBSTRCP_COUNT_TYPE_ERROR, + SUBSTRCP_INDEX_NEGATIVE_ERROR, + SUBSTRCP_INDEX_NON_INT_ERROR, + SUBSTRCP_INDEX_TYPE_ERROR, +) +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_NEGATIVE_INFINITY, + DECIMAL128_ONE_AND_HALF, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT32_OVERFLOW, + INT32_UNDERFLOW, + INT64_MAX, + INT64_MIN, +) + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Arity]: $substrCP requires exactly 3 arguments in an array. +SUBSTRCP_ARITY_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "arity_zero", + raw_args=[], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject 0 arguments", + ), + SubstrCPTest( + "arity_one", + raw_args=["hello"], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject 1 argument", + ), + SubstrCPTest( + "arity_two", + raw_args=["hello", 0], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject 2 arguments", + ), + SubstrCPTest( + "arity_four", + raw_args=["hello", 0, 1, 2], + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject 4 arguments", + ), + SubstrCPTest( + "arity_bare_value", + raw_args="hello", + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject a bare value instead of an array", + ), + SubstrCPTest( + "arity_bare_null", + raw_args=None, + error_code=EXPRESSION_TYPE_MISMATCH_ERROR, + msg="$substrCP should reject bare null argument", + ), +] + + +# Property [Field Path Parsing]: invalid field path syntax produces parse-time errors. 
+SUBSTRCP_FIELD_PATH_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "fieldpath_bare_dollar", + string="$", + error_code=INVALID_DOLLAR_FIELD_PATH, + msg="$substrCP should reject bare '$' as invalid field path", + ), + SubstrCPTest( + "fieldpath_double_dollar", + string="$$", + error_code=FAILED_TO_PARSE_ERROR, + msg="$substrCP should reject '$$' as empty variable name", + ), + SubstrCPTest( + "fieldpath_null_byte", + string="$a\x00b", + error_code=FIELD_PATH_NULL_BYTE_ERROR, + msg="$substrCP should reject field path containing null byte", + ), + SubstrCPTest( + "fieldpath_bare_dollar_index", + string="hello", + index="$", + count=1, + error_code=INVALID_DOLLAR_FIELD_PATH, + msg="$substrCP should reject bare '$' as index", + ), + SubstrCPTest( + "fieldpath_double_dollar_index", + string="hello", + index="$$", + count=1, + error_code=FAILED_TO_PARSE_ERROR, + msg="$substrCP should reject '$$' as index", + ), + SubstrCPTest( + "fieldpath_null_byte_index", + string="hello", + index="$a\x00b", + count=1, + error_code=FIELD_PATH_NULL_BYTE_ERROR, + msg="$substrCP should reject field path with null byte as index", + ), + SubstrCPTest( + "fieldpath_bare_dollar_count", + string="hello", + index=0, + count="$", + error_code=INVALID_DOLLAR_FIELD_PATH, + msg="$substrCP should reject bare '$' as count", + ), + SubstrCPTest( + "fieldpath_double_dollar_count", + string="hello", + index=0, + count="$$", + error_code=FAILED_TO_PARSE_ERROR, + msg="$substrCP should reject '$$' as count", + ), + SubstrCPTest( + "fieldpath_null_byte_count", + string="hello", + index=0, + count="$a\x00b", + error_code=FIELD_PATH_NULL_BYTE_ERROR, + msg="$substrCP should reject field path with null byte as count", + ), +] + + +# Property [Non-Integer Index]: fractional, NaN, infinity, and out-of-range numeric values for the +# index produce an error. 
+SUBSTRCP_INDEX_NON_INT_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "nonint_index_frac_double", + string="hello", + index=1.5, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject fractional double as index", + ), + SubstrCPTest( + "nonint_index_nan", + string="hello", + index=FLOAT_NAN, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject NaN double as index", + ), + SubstrCPTest( + "nonint_index_inf", + string="hello", + index=FLOAT_INFINITY, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject positive infinity as index", + ), + SubstrCPTest( + "nonint_index_neg_inf", + string="hello", + index=FLOAT_NEGATIVE_INFINITY, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject negative infinity as index", + ), + SubstrCPTest( + "nonint_index_large_double", + string="hello", + index=1e20, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject large double outside int32 range as index", + ), + SubstrCPTest( + "nonint_index_int64_overflow", + string="hello", + index=Int64(INT32_OVERFLOW), + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Int64 above int32 range as index", + ), + SubstrCPTest( + "nonint_index_int64_max", + string="hello", + index=INT64_MAX, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Int64 max as index", + ), + SubstrCPTest( + "nonint_index_int64_min", + string="hello", + index=INT64_MIN, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Int64 min as index", + ), + SubstrCPTest( + "nonint_index_frac_decimal128", + string="hello", + index=DECIMAL128_ONE_AND_HALF, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject fractional Decimal128 as index", + ), + SubstrCPTest( + "nonint_index_decimal128_nan", + string="hello", + index=Decimal128("NaN"), + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 NaN as index", + ), + 
SubstrCPTest( + "nonint_index_decimal128_inf", + string="hello", + index=DECIMAL128_INFINITY, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 Infinity as index", + ), + SubstrCPTest( + "nonint_index_decimal128_neg_inf", + string="hello", + index=DECIMAL128_NEGATIVE_INFINITY, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 negative Infinity as index", + ), + SubstrCPTest( + "nonint_index_decimal128_out_of_range", + string="hello", + index=Decimal128("3000000000"), + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject out-of-range Decimal128 as index", + ), + # 0.9999999999999999 displays as "1" but is not exactly representable as an integer. + SubstrCPTest( + "nonint_index_nearly_int", + string="hello", + index=0.9999999999999999, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject double that displays as integer but is not exact", + ), +] + + +# Property [Non-Integer Count]: fractional, NaN, infinity, and out-of-range numeric values for the +# count produce an error. 
+SUBSTRCP_COUNT_NON_INT_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "nonint_count_frac_double", + string="hello", + index=0, + count=1.5, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject fractional double as count", + ), + SubstrCPTest( + "nonint_count_nan", + string="hello", + index=0, + count=FLOAT_NAN, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject NaN double as count", + ), + SubstrCPTest( + "nonint_count_inf", + string="hello", + index=0, + count=FLOAT_INFINITY, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject positive infinity as count", + ), + SubstrCPTest( + "nonint_count_neg_inf", + string="hello", + index=0, + count=FLOAT_NEGATIVE_INFINITY, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject negative infinity as count", + ), + SubstrCPTest( + "nonint_count_large_double", + string="hello", + index=0, + count=1e20, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject large double outside int32 range as count", + ), + SubstrCPTest( + "nonint_count_int64_overflow", + string="hello", + index=0, + count=Int64(INT32_OVERFLOW), + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Int64 above int32 range as count", + ), + SubstrCPTest( + "nonint_count_int64_max", + string="hello", + index=0, + count=INT64_MAX, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Int64 max as count", + ), + SubstrCPTest( + "nonint_count_int64_min", + string="hello", + index=0, + count=INT64_MIN, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Int64 min as count", + ), + SubstrCPTest( + "nonint_count_frac_decimal128", + string="hello", + index=0, + count=DECIMAL128_ONE_AND_HALF, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject fractional Decimal128 as count", + ), + SubstrCPTest( + "nonint_count_decimal128_nan", + string="hello", + index=0, + count=Decimal128("NaN"), + 
error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 NaN as count", + ), + SubstrCPTest( + "nonint_count_decimal128_inf", + string="hello", + index=0, + count=DECIMAL128_INFINITY, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 Infinity as count", + ), + SubstrCPTest( + "nonint_count_decimal128_neg_inf", + string="hello", + index=0, + count=DECIMAL128_NEGATIVE_INFINITY, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject Decimal128 negative Infinity as count", + ), + SubstrCPTest( + "nonint_count_decimal128_out_of_range", + string="hello", + index=0, + count=Decimal128("3000000000"), + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject out-of-range Decimal128 as count", + ), + # 3.0000000000000004 displays as "3" but is not exactly representable as an integer. + SubstrCPTest( + "nonint_count_nearly_int", + string="hello", + index=0, + count=3.0000000000000004, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject double that displays as integer but is not exact", + ), + SubstrCPTest( + "nonint_count_expr_imprecision", + string="hello", + index=0, + count={"$add": [0.1, 0.2]}, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject expression result with floating-point imprecision", + ), +] + + +# Property [Negative Index]: negative int32 and Int64 values within int32 range produce an error. +SUBSTRCP_INDEX_NEGATIVE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "neg_index_int32", + string="hello", + index=-1, + error_code=SUBSTRCP_INDEX_NEGATIVE_ERROR, + msg="$substrCP should reject negative int32 index", + ), + SubstrCPTest( + "neg_index_int64", + string="hello", + index=Int64(-1), + error_code=SUBSTRCP_INDEX_NEGATIVE_ERROR, + msg="$substrCP should reject negative Int64 index within int32 range", + ), + # Out-of-range negative fires the non-int check before the negative check. 
+ SubstrCPTest( + "neg_index_out_of_range", + string="hello", + index=Int64(INT32_UNDERFLOW), + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP should reject out-of-range negative index with non-int error", + ), +] + + +# Property [Negative Count]: negative int32 and Int64 values within int32 range produce an error. +SUBSTRCP_COUNT_NEGATIVE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "neg_count_int32", + string="hello", + index=0, + count=-1, + error_code=SUBSTRCP_COUNT_NEGATIVE_ERROR, + msg="$substrCP should reject negative int32 count", + ), + SubstrCPTest( + "neg_count_int64", + string="hello", + index=0, + count=Int64(-1), + error_code=SUBSTRCP_COUNT_NEGATIVE_ERROR, + msg="$substrCP should reject negative Int64 count within int32 range", + ), + # Out-of-range negative fires the non-int check before the negative check. + SubstrCPTest( + "neg_count_out_of_range", + string="hello", + index=0, + count=Int64(INT32_UNDERFLOW), + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP should reject out-of-range negative count with non-int error", + ), +] + + +# Property [Error Precedence]: errors are evaluated in priority order: string type before index +# type before index non-integer before count type before count non-integer before count negative +# before index negative. 
+SUBSTRCP_ERROR_PRECEDENCE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "prec_string_over_index_type", + string=True, + index="bad", + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP string type error should take precedence over index type error", + ), + SubstrCPTest( + "prec_string_over_count_type", + string=True, + index=0, + count="bad", + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP string type error should take precedence over count type error", + ), + SubstrCPTest( + "prec_index_type_over_count_nonint", + string="hello", + index="bad", + count=1.5, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP index type error should take precedence over count non-integer", + ), + SubstrCPTest( + "prec_index_nonint_over_count_type", + string="hello", + index=1.5, + count="bad", + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP index non-integer should take precedence over count type error", + ), + SubstrCPTest( + "prec_index_nonint_over_count_nonint", + string="hello", + index=1.5, + count=1.5, + error_code=SUBSTRCP_INDEX_NON_INT_ERROR, + msg="$substrCP index non-integer should take precedence over count non-integer", + ), + SubstrCPTest( + "prec_count_type_over_index_neg", + string="hello", + index=-1, + count="bad", + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP count type error should take precedence over index negative", + ), + SubstrCPTest( + "prec_count_nonint_over_index_neg", + string="hello", + index=-1, + count=1.5, + error_code=SUBSTRCP_COUNT_NON_INT_ERROR, + msg="$substrCP count non-integer should take precedence over index negative", + ), + SubstrCPTest( + "prec_count_neg_over_index_neg", + string="hello", + index=-1, + count=-1, + error_code=SUBSTRCP_COUNT_NEGATIVE_ERROR, + msg="$substrCP count negative should take precedence over index negative", + ), +] + + +SUBSTRCP_INVALID_ARGS_ALL_TESTS = ( + SUBSTRCP_ARITY_TESTS + + SUBSTRCP_FIELD_PATH_TESTS + + SUBSTRCP_INDEX_NON_INT_TESTS + + 
SUBSTRCP_COUNT_NON_INT_TESTS + + SUBSTRCP_INDEX_NEGATIVE_TESTS + + SUBSTRCP_COUNT_NEGATIVE_TESTS + + SUBSTRCP_ERROR_PRECEDENCE_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_INVALID_ARGS_ALL_TESTS)) +def test_substrcp_invalid_args(collection, test_case: SubstrCPTest): + """Test $substrCP invalid argument cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_null.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_null.py new file mode 100644 index 00000000..ec20af1c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_null.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import MISSING + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Null String]: when the string expression is null or missing, the result is "". 
+SUBSTRCP_NULL_STRING_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "null_string", + string=None, + index=0, + count=1, + expected="", + msg="$substrCP should return empty string when string is null", + ), + SubstrCPTest( + "missing_string", + string=MISSING, + index=0, + count=1, + expected="", + msg="$substrCP should return empty string when string is missing", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_NULL_STRING_TESTS)) +def test_substrcp_null(collection, test_case: SubstrCPTest): + """Test $substrCP null string cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_numeric_coercion.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_numeric_coercion.py new file mode 100644 index 00000000..cdd51889 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_numeric_coercion.py @@ -0,0 +1,255 @@ +from __future__ import annotations + +import pytest +from bson import Decimal128, Int64 + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import DECIMAL128_NEGATIVE_ZERO, DOUBLE_NEGATIVE_ZERO + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Numeric Type Acceptance]: index and count accept int32, Int64, whole-number doubles, +# Decimal128 whole numbers, and negative zero across all numeric type combinations. 
+SUBSTRCP_NUMERIC_TYPE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "numtype_double_index", + string="hello", + index=2.0, + count=2, + expected="ll", + msg="$substrCP should accept whole-number double index", + ), + SubstrCPTest( + "numtype_decimal128_index", + string="hello", + index=Decimal128("2"), + count=2, + expected="ll", + msg="$substrCP should accept Decimal128 index", + ), + SubstrCPTest( + "numtype_decimal128_trailing_zero_index", + string="hello", + index=Decimal128("2.0"), + count=2, + expected="ll", + msg="$substrCP should accept Decimal128 with trailing zero as index", + ), + SubstrCPTest( + "numtype_decimal128_scientific_index", + string="hello", + index=Decimal128("20E-1"), + count=2, + expected="ll", + msg="$substrCP should accept Decimal128 in scientific notation as index", + ), + SubstrCPTest( + "numtype_neg_zero_double_index", + string="hello", + index=DOUBLE_NEGATIVE_ZERO, + count=2, + expected="he", + msg="$substrCP should treat double -0.0 as 0 for index", + ), + SubstrCPTest( + "numtype_neg_zero_decimal128_index", + string="hello", + index=DECIMAL128_NEGATIVE_ZERO, + count=2, + expected="he", + msg="$substrCP should treat Decimal128 -0 as 0 for index", + ), + SubstrCPTest( + "numtype_int64_count", + string="hello", + index=0, + count=Int64(2), + expected="he", + msg="$substrCP should accept Int64 count", + ), + SubstrCPTest( + "numtype_double_count", + string="hello", + index=0, + count=2.0, + expected="he", + msg="$substrCP should accept whole-number double count", + ), + SubstrCPTest( + "numtype_decimal128_count", + string="hello", + index=0, + count=Decimal128("2"), + expected="he", + msg="$substrCP should accept Decimal128 count", + ), + SubstrCPTest( + "numtype_decimal128_trailing_zero_count", + string="hello", + index=0, + count=Decimal128("2.0"), + expected="he", + msg="$substrCP should accept Decimal128 with trailing zero as count", + ), + SubstrCPTest( + "numtype_decimal128_scientific_count", + string="hello", + index=0, + 
count=Decimal128("20E-1"), + expected="he", + msg="$substrCP should accept Decimal128 in scientific notation as count", + ), + SubstrCPTest( + "numtype_neg_zero_double_count", + string="hello", + index=0, + count=DOUBLE_NEGATIVE_ZERO, + expected="", + msg="$substrCP should treat double -0.0 as 0 for count", + ), + SubstrCPTest( + "numtype_neg_zero_decimal128_count", + string="hello", + index=0, + count=DECIMAL128_NEGATIVE_ZERO, + expected="", + msg="$substrCP should treat Decimal128 -0 as 0 for count", + ), + # Cross-product of numeric types for index and count. + SubstrCPTest( + "numtype_int32_int64", + string="hello", + index=1, + count=Int64(2), + expected="el", + msg="$substrCP should accept int32 index with Int64 count", + ), + SubstrCPTest( + "numtype_int32_double", + string="hello", + index=1, + count=2.0, + expected="el", + msg="$substrCP should accept int32 index with double count", + ), + SubstrCPTest( + "numtype_int32_decimal128", + string="hello", + index=1, + count=Decimal128("2"), + expected="el", + msg="$substrCP should accept int32 index with Decimal128 count", + ), + SubstrCPTest( + "numtype_int64_int32", + string="hello", + index=Int64(1), + count=2, + expected="el", + msg="$substrCP should accept Int64 index with int32 count", + ), + SubstrCPTest( + "numtype_int64_int64", + string="hello", + index=Int64(1), + count=Int64(2), + expected="el", + msg="$substrCP should accept Int64 index with Int64 count", + ), + SubstrCPTest( + "numtype_int64_double", + string="hello", + index=Int64(1), + count=2.0, + expected="el", + msg="$substrCP should accept Int64 index with double count", + ), + SubstrCPTest( + "numtype_int64_decimal128", + string="hello", + index=Int64(1), + count=Decimal128("2"), + expected="el", + msg="$substrCP should accept Int64 index with Decimal128 count", + ), + SubstrCPTest( + "numtype_double_int32", + string="hello", + index=1.0, + count=2, + expected="el", + msg="$substrCP should accept double index with int32 count", + ), + 
SubstrCPTest( + "numtype_double_int64", + string="hello", + index=1.0, + count=Int64(2), + expected="el", + msg="$substrCP should accept double index with Int64 count", + ), + SubstrCPTest( + "numtype_double_double", + string="hello", + index=1.0, + count=2.0, + expected="el", + msg="$substrCP should accept double index with double count", + ), + SubstrCPTest( + "numtype_double_decimal128", + string="hello", + index=1.0, + count=Decimal128("2"), + expected="el", + msg="$substrCP should accept double index with Decimal128 count", + ), + SubstrCPTest( + "numtype_decimal128_int32", + string="hello", + index=Decimal128("1"), + count=2, + expected="el", + msg="$substrCP should accept Decimal128 index with int32 count", + ), + SubstrCPTest( + "numtype_decimal128_int64", + string="hello", + index=Decimal128("1"), + count=Int64(2), + expected="el", + msg="$substrCP should accept Decimal128 index with Int64 count", + ), + SubstrCPTest( + "numtype_decimal128_double", + string="hello", + index=Decimal128("1"), + count=2.0, + expected="el", + msg="$substrCP should accept Decimal128 index with double count", + ), + SubstrCPTest( + "numtype_decimal128_decimal128", + string="hello", + index=Decimal128("1"), + count=Decimal128("2"), + expected="el", + msg="$substrCP should accept Decimal128 for both index and count", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_NUMERIC_TYPE_TESTS)) +def test_substrcp_numeric_coercion(collection, test_case: SubstrCPTest): + """Test $substrCP numeric type acceptance cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_size_limit.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_size_limit.py new file mode 100644 index 00000000..b273d891 
--- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_size_limit.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import STRING_SIZE_LIMIT_ERROR +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import STRING_SIZE_LIMIT_BYTES + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [String Size Limit Success]: input strings below the size limit are accepted. +SUBSTRCP_SIZE_LIMIT_SUCCESS_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "size_long_string", + string="a" * 10_000, + index=9_998, + count=2, + expected="aa", + msg="$substrCP should extract from a 10000-character string", + ), + SubstrCPTest( + "size_one_under", + string="a" * (STRING_SIZE_LIMIT_BYTES - 1), + index=0, + count=1, + expected="a", + msg="$substrCP should accept input string one byte under the 16 MB limit", + ), +] + + +# Property [String Size Limit]: input strings at or above the size limit produce an error. +SUBSTRCP_SIZE_LIMIT_ERROR_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "size_at_limit", + string="a" * STRING_SIZE_LIMIT_BYTES, + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$substrCP should reject input string at the 16 MB byte limit", + ), + # 2-byte chars: half the limit in code points, but the full limit in UTF-8 bytes. + SubstrCPTest( + "size_multibyte_at_limit", + string="é" * (STRING_SIZE_LIMIT_BYTES // 2), + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$substrCP should reject multi-byte string exceeding 16 MB in bytes", + ), + # Eager size check: untaken $cond branch with 16 MB string still errors.
+ SubstrCPTest( + "size_cond_untaken_branch", + string={"$cond": [True, "hello", "a" * STRING_SIZE_LIMIT_BYTES]}, + error_code=STRING_SIZE_LIMIT_ERROR, + msg="$substrCP should reject 16 MB string in untaken $cond branch", + ), +] + + +SUBSTRCP_SIZE_LIMIT_ALL_TESTS = SUBSTRCP_SIZE_LIMIT_SUCCESS_TESTS + SUBSTRCP_SIZE_LIMIT_ERROR_TESTS + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_SIZE_LIMIT_ALL_TESTS)) +def test_substrcp_size_limit(collection, test_case: SubstrCPTest): + """Test $substrCP size limit cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_string_coercion.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_string_coercion.py new file mode 100644 index 00000000..4c626d92 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_string_coercion.py @@ -0,0 +1,228 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Decimal128, Int64, Timestamp +from bson.code import Code + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_INFINITY, + DECIMAL128_LARGE_EXPONENT, + DECIMAL128_MAX_NEGATIVE, + DECIMAL128_MIN_POSITIVE, + DOUBLE_NEGATIVE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, +) + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Type Coercion]: non-string types for param 1 coerce to their string representation. 
+SUBSTRCP_TYPE_COERCION_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "coerce_int32", + string=42, + index=0, + count=2, + expected="42", + msg="$substrCP should coerce int32 to its digit-string representation", + ), + SubstrCPTest( + "coerce_int64", + string=Int64(42), + index=0, + count=2, + expected="42", + msg="$substrCP should coerce Int64 to its digit-string representation", + ), + SubstrCPTest( + "coerce_double_whole", + string=3.0, + index=0, + count=1, + expected="3", + msg="$substrCP should coerce whole-number double without decimal point", + ), + SubstrCPTest( + "coerce_double_fractional", + string=3.14, + index=0, + count=4, + expected="3.14", + msg="$substrCP should coerce fractional double to string", + ), + # 1.23456789 has 9 significant digits but coerces to 6: "1.23457". + SubstrCPTest( + "coerce_double_fractional_6sig", + string=1.23456789, + index=0, + count=7, + expected="1.23457", + msg="$substrCP should coerce fractional double to at most 6 significant digits", + ), + SubstrCPTest( + "coerce_double_fixed_boundary", + string=999_999.0, + index=0, + count=6, + expected="999999", + msg="$substrCP should coerce 999999.0 using fixed notation", + ), + SubstrCPTest( + "coerce_double_scientific_boundary", + string=1_000_000.0, + index=0, + count=5, + expected="1e+06", + msg="$substrCP should coerce 1000000.0 using scientific notation", + ), + SubstrCPTest( + "coerce_double_small_fixed", + string=1e-4, + index=0, + count=6, + expected="0.0001", + msg="$substrCP should coerce 1e-4 using fixed notation", + ), + SubstrCPTest( + "coerce_double_small_scientific", + string=1e-5, + index=0, + count=5, + expected="1e-05", + msg="$substrCP should coerce 1e-5 using scientific notation", + ), + SubstrCPTest( + "coerce_double_nan", + string=FLOAT_NAN, + index=0, + count=3, + expected="nan", + msg="$substrCP should coerce double NaN to 'nan'", + ), + SubstrCPTest( + "coerce_double_inf", + string=FLOAT_INFINITY, + index=0, + count=3, + expected="inf", + 
msg="$substrCP should coerce double inf to 'inf'", + ), + SubstrCPTest( + "coerce_double_neg_inf", + string=FLOAT_NEGATIVE_INFINITY, + index=0, + count=4, + expected="-inf", + msg="$substrCP should coerce double -inf to '-inf'", + ), + SubstrCPTest( + "coerce_double_neg_zero", + string=DOUBLE_NEGATIVE_ZERO, + index=0, + count=2, + expected="-0", + msg="$substrCP should coerce double -0.0 to '-0'", + ), + SubstrCPTest( + "coerce_decimal128", + string=Decimal128("123.0"), + index=0, + count=5, + expected="123.0", + msg="$substrCP should coerce Decimal128 preserving trailing zeros", + ), + SubstrCPTest( + "coerce_decimal128_nan", + string=Decimal128("NaN"), + index=0, + count=3, + expected="NaN", + msg="$substrCP should coerce Decimal128 NaN to 'NaN'", + ), + SubstrCPTest( + "coerce_decimal128_infinity", + string=DECIMAL128_INFINITY, + index=0, + count=8, + expected="Infinity", + msg="$substrCP should coerce Decimal128 Infinity to 'Infinity'", + ), + SubstrCPTest( + "coerce_decimal128_large_exponent", + string=DECIMAL128_LARGE_EXPONENT, + index=0, + count=41, + expected="1.000000000000000000000000000000000E+6144", + msg="$substrCP should coerce Decimal128 with large exponent to expanded form", + ), + SubstrCPTest( + "coerce_decimal128_min_positive", + string=DECIMAL128_MIN_POSITIVE, + index=0, + count=7, + expected="1E-6176", + msg="$substrCP should coerce Decimal128 minimum positive value", + ), + SubstrCPTest( + "coerce_decimal128_max_negative", + string=DECIMAL128_MAX_NEGATIVE, + index=0, + count=8, + expected="-1E-6176", + msg="$substrCP should coerce Decimal128 maximum negative value", + ), + SubstrCPTest( + "coerce_datetime", + string=datetime(2024, 1, 15, 12, 30, 45, 123_000, tzinfo=timezone.utc), + index=0, + count=24, + expected="2024-01-15T12:30:45.123Z", + msg="$substrCP should coerce datetime to ISO 8601 format", + ), + SubstrCPTest( + "coerce_timestamp", + string=Timestamp(1, 1), + index=0, + count=17, + expected="Jan 1 00:00:01:1", + msg="$substrCP 
should coerce Timestamp to its string format", + ), + SubstrCPTest( + "coerce_code", + string=Code("function() {}"), + index=0, + count=14, + expected="function() {}", + msg="$substrCP should coerce Code to its code string", + ), + # Timestamp(0,0) as a literal coerces to its string form. As a stored field, the server + # replaces it with the current time, so this only tests the literal case. + SubstrCPTest( + "coerce_timestamp_zero", + string=Timestamp(0, 0), + index=0, + count=17, + expected="Jan 1 00:00:00:0", + msg="$substrCP should coerce Timestamp(0,0) literal to its string format", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_TYPE_COERCION_TESTS)) +def test_substrcp_string_coercion(collection, test_case: SubstrCPTest): + """Test $substrCP string coercion cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_type_errors.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_type_errors.py new file mode 100644 index 00000000..667abe70 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_type_errors.py @@ -0,0 +1,358 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import Binary, MaxKey, MinKey, ObjectId, Regex, Timestamp +from bson.code import Code + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BSON_TO_STRING_CONVERSION_ERROR, + SUBSTRCP_COUNT_TYPE_ERROR, + SUBSTRCP_INDEX_TYPE_ERROR, +) +from documentdb_tests.framework.parametrize import 
pytest_params +from documentdb_tests.framework.test_constants import MISSING + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Type Strictness Param 1]: non-coercible types for the string expression produce an +# error. +SUBSTRCP_STRING_TYPE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "type_string_bool", + string=True, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject bool as string expression", + ), + SubstrCPTest( + "type_string_array", + string=["a"], + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject array as string expression", + ), + SubstrCPTest( + "type_string_object", + string={"a": 1}, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject object as string expression", + ), + SubstrCPTest( + "type_string_objectid", + string=ObjectId("507f1f77bcf86cd799439011"), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject ObjectId as string expression", + ), + SubstrCPTest( + "type_string_binary", + string=Binary(b"data"), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject Binary as string expression", + ), + SubstrCPTest( + "type_string_binary_uuid", + string=Binary(b"data", 4), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject Binary UUID as string expression", + ), + SubstrCPTest( + "type_string_regex", + string=Regex("pattern"), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject Regex as string expression", + ), + SubstrCPTest( + "type_string_code_scope", + string=Code("function() {}", {"x": 1}), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject Code with scope as string expression", + ), + SubstrCPTest( + "type_string_minkey", + string=MinKey(), + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject MinKey as string expression", + ), + SubstrCPTest( + "type_string_maxkey", + string=MaxKey(), + 
error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject MaxKey as string expression", + ), + SubstrCPTest( + "type_string_expr_array", + string={"$literal": [1, 2]}, + error_code=BSON_TO_STRING_CONVERSION_ERROR, + msg="$substrCP should reject expression that evaluates to array at runtime", + ), +] + +# Property [Type Strictness Param 2]: non-numeric types for the index produce an error. +SUBSTRCP_INDEX_TYPE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "type_index_string", + string="hello", + index="bad", + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject string as index", + ), + SubstrCPTest( + "type_index_bool", + string="hello", + index=True, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject bool as index", + ), + SubstrCPTest( + "type_index_array", + string="hello", + index=[1], + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject array as index", + ), + SubstrCPTest( + "type_index_object", + string="hello", + index={"a": 1}, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject object as index", + ), + SubstrCPTest( + "type_index_objectid", + string="hello", + index=ObjectId("507f1f77bcf86cd799439011"), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject ObjectId as index", + ), + SubstrCPTest( + "type_index_datetime", + string="hello", + index=datetime(2024, 1, 1, tzinfo=timezone.utc), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject datetime as index", + ), + SubstrCPTest( + "type_index_timestamp", + string="hello", + index=Timestamp(1, 1), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject Timestamp as index", + ), + SubstrCPTest( + "type_index_binary", + string="hello", + index=Binary(b"data"), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject Binary as index", + ), + SubstrCPTest( + "type_index_regex", + string="hello", + index=Regex("pattern"), + 
error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject Regex as index", + ), + SubstrCPTest( + "type_index_code", + string="hello", + index=Code("function() {}"), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject Code as index", + ), + SubstrCPTest( + "type_index_code_scope", + string="hello", + index=Code("function() {}", {"x": 1}), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject Code with scope as index", + ), + SubstrCPTest( + "type_index_minkey", + string="hello", + index=MinKey(), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject MinKey as index", + ), + SubstrCPTest( + "type_index_maxkey", + string="hello", + index=MaxKey(), + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject MaxKey as index", + ), + SubstrCPTest( + "type_index_null", + string="hello", + index=None, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject null as index", + ), + SubstrCPTest( + "type_index_missing", + string="hello", + index=MISSING, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject missing field as index", + ), + SubstrCPTest( + "type_index_expr_array", + string="hello", + index={"$literal": [1, 2]}, + error_code=SUBSTRCP_INDEX_TYPE_ERROR, + msg="$substrCP should reject expression that evaluates to array as index", + ), +] + +# Property [Type Strictness Param 3]: non-numeric types for the count produce +# SUBSTRCP_COUNT_TYPE_ERROR. 
+SUBSTRCP_COUNT_TYPE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "type_count_string", + string="hello", + index=0, + count="bad", + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject string as count", + ), + SubstrCPTest( + "type_count_bool", + string="hello", + index=0, + count=True, + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject bool as count", + ), + SubstrCPTest( + "type_count_array", + string="hello", + index=0, + count=[1], + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject array as count", + ), + SubstrCPTest( + "type_count_object", + string="hello", + index=0, + count={"a": 1}, + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject object as count", + ), + SubstrCPTest( + "type_count_objectid", + string="hello", + index=0, + count=ObjectId("507f1f77bcf86cd799439011"), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject ObjectId as count", + ), + SubstrCPTest( + "type_count_datetime", + string="hello", + index=0, + count=datetime(2024, 1, 1, tzinfo=timezone.utc), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject datetime as count", + ), + SubstrCPTest( + "type_count_timestamp", + string="hello", + index=0, + count=Timestamp(1, 1), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject Timestamp as count", + ), + SubstrCPTest( + "type_count_binary", + string="hello", + index=0, + count=Binary(b"data"), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject Binary as count", + ), + SubstrCPTest( + "type_count_regex", + string="hello", + index=0, + count=Regex("pattern"), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject Regex as count", + ), + SubstrCPTest( + "type_count_code", + string="hello", + index=0, + count=Code("function() {}"), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject Code as count", + ), + SubstrCPTest( + "type_count_code_scope", + string="hello", + 
index=0, + count=Code("function() {}", {"x": 1}), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject Code with scope as count", + ), + SubstrCPTest( + "type_count_minkey", + string="hello", + index=0, + count=MinKey(), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject MinKey as count", + ), + SubstrCPTest( + "type_count_maxkey", + string="hello", + index=0, + count=MaxKey(), + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject MaxKey as count", + ), + SubstrCPTest( + "type_count_null", + string="hello", + index=0, + count=None, + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject null as count", + ), + SubstrCPTest( + "type_count_missing", + string="hello", + index=0, + count=MISSING, + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject missing field as count", + ), + SubstrCPTest( + "type_count_expr_array", + string="hello", + index=0, + count={"$literal": [1, 2]}, + error_code=SUBSTRCP_COUNT_TYPE_ERROR, + msg="$substrCP should reject expression that evaluates to array as count", + ), +] + + +SUBSTRCP_TYPE_ERROR_ALL_TESTS = ( + SUBSTRCP_STRING_TYPE_TESTS + SUBSTRCP_INDEX_TYPE_TESTS + SUBSTRCP_COUNT_TYPE_TESTS +) + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_TYPE_ERROR_ALL_TESTS)) +def test_substrcp_type_errors(collection, test_case: SubstrCPTest): + """Test $substrCP type error cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_usage.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_usage.py new file mode 100644 index 00000000..ad66b619 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/test_substrCP_usage.py @@ -0,0 
+1,128 @@ +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.expressions.utils.utils import ( + execute_expression, + execute_project_with_insert, +) +from documentdb_tests.framework.assertions import assertResult, assertSuccess +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import MISSING + +from .utils.substrCP_common import SubstrCPTest, _expr + +# Property [Expression Arguments]: all three parameters accept expressions that resolve to valid +# types. +SUBSTRCP_EXPRESSION_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "expr_string", + string={"$concat": ["hel", "lo"]}, + index=0, + count=5, + expected="hello", + msg="$substrCP should accept an expression for the string parameter", + ), + SubstrCPTest( + "expr_index", + string="hello", + index={"$add": [1, 1]}, + count=3, + expected="llo", + msg="$substrCP should accept an expression for the index parameter", + ), + SubstrCPTest( + "expr_count", + string="hello", + index=0, + count={"$strLenCP": "hel"}, + expected="hel", + msg="$substrCP should accept an expression for the count parameter", + ), + SubstrCPTest( + "nested_substrCP", + string={"$substrCP": ["hello world", 0, 5]}, + index=1, + count=3, + expected="ell", + msg="$substrCP should compose with itself (substring of a substring)", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_EXPRESSION_TESTS)) +def test_substrcp_expression_cases(collection, test_case: SubstrCPTest): + """Test $substrCP expression argument cases.""" + result = execute_expression(collection, _expr(test_case)) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) + + +# Property [Document Field References]: $substrCP works with field references from inserted +# documents. 
+def test_substrcp_document_fields(collection): + """Test $substrCP reads values from document fields.""" + result = execute_project_with_insert( + collection, + {"s": "hello world", "i": 6, "c": 5}, + {"result": {"$substrCP": ["$s", "$i", "$c"]}}, + ) + assertSuccess( + result, + [{"result": "world"}], + msg="$substrCP should resolve field references from document", + ) + + +# Property [Return Type]: $substrCP always returns a BSON string, including when the input was +# coerced from a non-string type or when the result is empty. +SUBSTRCP_RETURN_TYPE_TESTS: list[SubstrCPTest] = [ + SubstrCPTest( + "return_type_string", + string="hello", + index=0, + count=3, + msg="$substrCP of string input should return type string", + ), + SubstrCPTest( + "return_type_coerced", + string=42, + index=0, + count=2, + msg="$substrCP of coerced input should return type string", + ), + SubstrCPTest( + "return_type_null_input", + string=None, + index=0, + count=1, + msg="$substrCP of null input should return type string", + ), + SubstrCPTest( + "return_type_missing_input", + string=MISSING, + index=0, + count=1, + msg="$substrCP of missing input should return type string", + ), + SubstrCPTest( + "return_type_empty", + string="hello", + index=0, + count=0, + msg="$substrCP with count 0 should return type string", + ), +] + + +@pytest.mark.parametrize("test_case", pytest_params(SUBSTRCP_RETURN_TYPE_TESTS)) +def test_substrcp_return_type(collection, test_case: SubstrCPTest): + """Test $substrCP result is always type string.""" + result = execute_expression( + collection, {"$type": {"$substrCP": [test_case.string, test_case.index, test_case.count]}} + ) + assertSuccess(result, [{"result": "string"}], msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/utils/substrCP_common.py b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/utils/substrCP_common.py new file mode 100644 index 00000000..77bd826c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/expressions/string/substrCP/utils/substrCP_common.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from documentdb_tests.framework.test_case import BaseTestCase + +# Sentinel for "omit this parameter from the expression." Distinct from None +# (which means pass null). +_OMIT = object() + + +@dataclass(frozen=True) +class SubstrCPTest(BaseTestCase): + """Test case for $substrCP operator.""" + + string: Any = None + index: Any = 0 + count: Any = 1 + raw_args: Any = _OMIT # Raw operator argument override for arity tests. + + +def _expr(test_case: SubstrCPTest) -> dict: + if test_case.raw_args is not _OMIT: + return {"$substrCP": test_case.raw_args} + return {"$substrCP": [test_case.string, test_case.index, test_case.count]} diff --git a/documentdb_tests/framework/error_codes.py b/documentdb_tests/framework/error_codes.py index c285b5aa..176f6ef8 100644 --- a/documentdb_tests/framework/error_codes.py +++ b/documentdb_tests/framework/error_codes.py @@ -4,11 +4,20 @@ """ DIVIDE_BY_ZERO_ERROR = 2 +FAILED_TO_PARSE_ERROR = 9 TYPE_MISMATCH_ERROR = 14 +BSON_TO_STRING_CONVERSION_ERROR = 16007 EXPRESSION_TYPE_MISMATCH_ERROR = 16020 +SUBSTR_START_TYPE_ERROR = 16034 +SUBSTR_LENGTH_TYPE_ERROR = 16035 +FIELD_PATH_NULL_BYTE_ERROR = 16411 +STRING_SIZE_LIMIT_ERROR = 16493 MODULO_ZERO_REMAINDER_ERROR = 16610 MODULO_NON_NUMERIC_ERROR = 16611 MORE_THAN_ONE_DATE_ERROR = 16612 +INVALID_DOLLAR_FIELD_PATH = 16872 +SUBSTR_CONTINUATION_BYTE_START_ERROR = 28656 +SUBSTR_MID_CHARACTER_END_ERROR = 28657 ABS_OVERFLOW_ERROR = 28680 LOG_NON_NUMERIC_VALUE_ERROR = 28756 LOG_NON_NUMERIC_BASE_ERROR = 28757 @@ 
-20,4 +29,12 @@ POW_BASE_ZERO_EXP_NEGATIVE_ERROR = 28764 NON_NUMERIC_TYPE_MISMATCH_ERROR = 28765 LN_NON_POSITIVE_INPUT_ERROR = 28766 +OUT_OF_RANGE_CONVERSION_ERROR = 31109 +SUBSTRCP_INDEX_TYPE_ERROR = 34450 +SUBSTRCP_INDEX_NON_INT_ERROR = 34451 +SUBSTRCP_COUNT_TYPE_ERROR = 34452 +SUBSTRCP_COUNT_NON_INT_ERROR = 34453 +SUBSTRCP_COUNT_NEGATIVE_ERROR = 34454 +SUBSTRCP_INDEX_NEGATIVE_ERROR = 34455 +SUBSTR_NEGATIVE_START_ERROR = 50752 MODULO_DECIMAL128_ZERO_REMAINDER_ERROR = 5733415 diff --git a/documentdb_tests/framework/test_constants.py b/documentdb_tests/framework/test_constants.py index 9e3088ed..e4f338a6 100644 --- a/documentdb_tests/framework/test_constants.py +++ b/documentdb_tests/framework/test_constants.py @@ -49,6 +49,8 @@ DECIMAL128_MAX = Decimal128("9.999999999999999999999999999999999E+6144") DECIMAL128_LARGE_EXPONENT = Decimal128("1E+6144") DECIMAL128_SMALL_EXPONENT = Decimal128("1E-6143") +DECIMAL128_MIN_POSITIVE = Decimal128("1E-6176") +DECIMAL128_MAX_NEGATIVE = Decimal128("-1E-6176") DECIMAL128_TRAILING_ZERO = Decimal128("1.0") DECIMAL128_MANY_TRAILING_ZEROS = Decimal128("1.00000000000000000000000000000000") DECIMAL128_NAN = Decimal128("nan") @@ -64,6 +66,7 @@ # Other constant values MISSING = "$missing" +STRING_SIZE_LIMIT_BYTES = 16 * 1024 * 1024 # Int32 lists NUMERIC_INT32_NEGATIVE = [INT32_UNDERFLOW, INT32_MIN]