From 73bf1beae0d21bfaab3a6bb0d2697138d230b3ed Mon Sep 17 00:00:00 2001 From: Tom Reitz Date: Tue, 14 Nov 2023 12:29:07 -0600 Subject: [PATCH 1/2] add optional type-setting for discrete probabilities distribution --- macros/distributions/discrete/probabilities.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/macros/distributions/discrete/probabilities.sql b/macros/distributions/discrete/probabilities.sql index f9da19b..963a053 100644 --- a/macros/distributions/discrete/probabilities.sql +++ b/macros/distributions/discrete/probabilities.sql @@ -1,4 +1,4 @@ -{% macro synth_distribution_discrete_probabilities(probabilities) %} +{% macro synth_distribution_discrete_probabilities(probabilities, type="string") %} {# Set up some variables: #} {%- set epsilon = 0.00001 -%}{# "close enough" to zero #} {%- set ns = namespace(max_prob_digits=1, keys=[], values=[], curr_idx=0, curr_threshold=0.0) -%} @@ -18,9 +18,9 @@ {{ exceptions.raise_compiler_error("`probabilities` must sum to 1.0, not " + ns.values|sum|string) }} {%- endif -%} - {%- if ns.keys[0] is number -%} + {%- if ns.keys[0] is number or type!="string" -%} {% set wrap = "" %} - {% elif ns.keys[0] is string %} + {% elif ns.keys[0] is string or type=="string" %} {% set wrap = "'" %} {% else %} {{ exceptions.raise_compiler_error("`probabilities` keys must be strings or numbers") }} From e2ee1fc9139b6d6da228b7f9fcba66d4b3bf2d18 Mon Sep 17 00:00:00 2001 From: Tom Reitz Date: Thu, 16 Jan 2025 09:53:58 -0600 Subject: [PATCH 2/2] improvements per PR discussion --- README.md | 11 +++++++++-- .../distributions/discrete/probabilities.sql | 18 ++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5fab849..643e087 100644 --- a/README.md +++ b/README.md @@ -226,10 +226,17 @@ Generates discrete values according to a user-defined probability set. 
* a list (array) such as `[0.05, 0.8, 0.15]`, in which case the (zero-based) indices are the integer values generated * or a dictionary (key-value) structure such as `{ "1":0.05, "3":0.8, "7":0.15 }` with integer keys (specified as strings in order to be valud JSON), in which case the keys are the integers generated -You may actually specify string or float keys in your `probabilities` dict to generate those values instead of integers, however you must specify the additional parameter `keys_type="varchar"` (or similar) so the the value types are correct. For example: +You may actually specify `string`, `number`, or `boolean` keys in your `probabilities` dict to generate those values instead of integers, however you must specify the additional parameter `keys_type="number"` (or similar) so that the value types are correct. For example: ```python - synth_distributions_discrete_probabilities(probabilities={"cat":0.3, "dog":0.5, "parrot":0.2}, keys_type="varchar") + synth_distribution_discrete_probabilities(probabilities={"97":0.3, "85":0.5, "64":0.2}, keys_type="number") ``` +The default `keys_type` is `string`. + +No matter what `keys_type` you choose, you may optionally cast the values to a different type in your database engine with `cast_to`, for example: +```python + synth_distribution_discrete_probabilities(probabilities={"2024-12-31":0.5, "2025-01-01":0.5}, cast_to="date") +``` +(This will compile to something like `case ... when ... then CAST('2024-12-31' AS date) when ... then CAST('2025-01-01' AS date) ... end`.) `probabilities` must sum to `1.0`. 
diff --git a/macros/distributions/discrete/probabilities.sql b/macros/distributions/discrete/probabilities.sql index 963a053..83d4ab8 100644 --- a/macros/distributions/discrete/probabilities.sql +++ b/macros/distributions/discrete/probabilities.sql @@ -1,4 +1,4 @@ -{% macro synth_distribution_discrete_probabilities(probabilities, type="string") %} +{% macro synth_distribution_discrete_probabilities(probabilities, keys_type="string", cast_to=None) %} {# Set up some variables: #} {%- set epsilon = 0.00001 -%}{# "close enough" to zero #} {%- set ns = namespace(max_prob_digits=1, keys=[], values=[], curr_idx=0, curr_threshold=0.0) -%} @@ -8,6 +8,7 @@ {%- set ns.keys = probabilities.keys()|list -%} {%- set ns.values = probabilities.values()|list -%} {%- elif probabilities is iterable -%}{#- list -#} + {% set keys_type = "number" %} {%- set ns.keys = range(probabilities|length) -%} {%- set ns.values = probabilities -%} {%- else -%} @@ -18,12 +19,12 @@ {{ exceptions.raise_compiler_error("`probabilities` must sum to 1.0, not " + ns.values|sum|string) }} {%- endif -%} - {%- if ns.keys[0] is number or type!="string" -%} + {%- if keys_type in ["number", "boolean"] -%} {% set wrap = "" %} - {% elif ns.keys[0] is string or type=="string" %} + {% elif keys_type=="string" %} {% set wrap = "'" %} {% else %} - {{ exceptions.raise_compiler_error("`probabilities` keys must be strings or numbers") }} + {{ exceptions.raise_compiler_error("`keys_type` must be `string`, `number`, or `boolean`") }} {% endif %} {%- set ns.curr_threshold = ns.values[0] -%} @@ -57,7 +58,9 @@ ) ) }} ], - {{wrap}}{{value_list[value_list|length - 1]}}{{wrap}} + {% if cast_to %}CAST({% endif %} + {{wrap}}{{value_list[value_list|length - 1]}}{{wrap}} + {% if cast_to %} AS {{cast_to}} ){% endif %} ) {% else %} {# Case statement on uniformly-distributed range: #} @@ -67,7 +70,10 @@ {%- set ns.curr_idx = ns.curr_idx + 1 -%} {%- set ns.curr_threshold = ns.curr_threshold + ns.values[ns.curr_idx] -%} {%- endif -%} - 
when {{i}} then {{wrap}}{{ns.keys[ns.curr_idx]}}{{wrap}} + when {{i}} then + {% if cast_to %}CAST({% endif %} + {{wrap}}{{ns.keys[ns.curr_idx]}}{{wrap}} + {% if cast_to %} AS {{cast_to}} ){% endif %} {% endfor %} end {% endif %}