diff --git a/CHANGELOG.md b/CHANGELOG.md index 1928786..f5dd3e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,45 @@ # Changelog +## 1.0.1 + +### Added + +- **`llm_gateway_generate_text()` UDF wrapper for AI-powered DataFrame transformations.** + + New method on proxy providers to generate AI completions in DataFrame operations via a built-in UDF. + + ```python + from datacustomcode import Client + from pyspark.sql.functions import col + + client = Client() + + # Generate summaries in a DataFrame column + df = df.withColumn( + "summary", + client._proxy.llm_gateway_generate_text( + "Summarize {company}: revenue={revenue}, CEO={ceo}", + { + "company": col("company"), + "revenue": col("revenue"), + "ceo": col("ceo") + }, + llmModelId="sfdc_ai__DefaultGPT4Omni", + maxTokens=200 + ) + ) + ``` + + **Local Development:** Returns a placeholder string (doesn't execute) + **Production:** Calls a built-in UDF + + **Parameters:** + - `template` (str): Prompt template with {placeholder} syntax + - `values` (dict or Column): Dict mapping placeholders to Columns, or pre-built named_struct + - `llmModelId` (str): Model identifier (required, e.g., "sfdc_ai__DefaultGPT4Omni") + - `maxTokens` (int): Maximum number of tokens that will be spent on this query + + ## 1.0.0 ### Breaking Changes diff --git a/README.md b/README.md index d3d601a..fe2e226 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ You should only need the following methods: * `write_to_dmo(name, spark_dataframe, write_mode)` – Write to a Data Lake Object by name with a Spark dataframe For example: -``` +```python from datacustomcode import Client client = Client() sdf = client.read_dlo('my_DLO') client.write_to_dlo('output_DLO') ``` - > [!WARNING] > Currently we only support reading from DMOs and writing to DMOs or reading from DLOs and writing to DLOs, but they cannot mix. +### LLM Gateway + +Generate AI completions in DataFrame transformations using the LLM gateway UDF.
+ +```python +from datacustomcode import Client +from pyspark.sql.functions import col + +client = Client() + +# Use template with placeholders +df = df.withColumn( + "summary", + client._proxy.llm_gateway_generate_text( + "Summarize {company}: revenue={revenue}, CEO={ceo}", + { + "company": col("company"), + "revenue": col("revenue"), + "ceo": col("ceo") + }, + llmModelId="sfdc_ai__DefaultGPT4Omni", + maxTokens=200 + ) +) +``` + +> [!WARNING] +> This method returns a placeholder string in local development. It only makes an LLM call and spends tokens when deployed, where it calls the real LLM Gateway service via a built-in UDF. ## CLI diff --git a/src/datacustomcode/proxy/client/LocalProxyClientProvider.py b/src/datacustomcode/proxy/client/LocalProxyClientProvider.py index 515db00..9c08b54 100644 --- a/src/datacustomcode/proxy/client/LocalProxyClientProvider.py +++ b/src/datacustomcode/proxy/client/LocalProxyClientProvider.py @@ -27,3 +27,8 @@ def __init__(self, **kwargs: object) -> None: def call_llm_gateway(self, llmModelId: str, prompt: str, maxTokens: int) -> str: return f"Hello, thanks for using {llmModelId}. So many tokens: {maxTokens}" + + def llm_gateway_generate_text( + self, template, values, llmModelId: str, maxTokens: int + ): + return f"Using Generate Text with {llmModelId} and maxTokens: {maxTokens}" diff --git a/src/datacustomcode/proxy/client/base.py b/src/datacustomcode/proxy/client/base.py index 5c840a0..85e304a 100644 --- a/src/datacustomcode/proxy/client/base.py +++ b/src/datacustomcode/proxy/client/base.py @@ -25,3 +25,8 @@ def __init__(self): @abstractmethod def call_llm_gateway(self, llmModelId: str, prompt: str, maxTokens: int) -> str: ... + + @abstractmethod + def llm_gateway_generate_text( + self, template, values, llmModelId: str, maxTokens: int + ): ...