SAFEHR-data · myyong · Apr 10, 2026 · Apr 10, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/.github/workflows/tests-mssql.yml b/.github/workflows/tests-mssql.yml
@@ -0,0 +1,64 @@
+---
+name: mssql integration tests
+on:
+  pull_request:
+  workflow_dispatch:
+
+env:
+  PYTHON_VERSION: "3.10"
+  MSSQL_SA_PASSWORD: "Datafaker!Test123"
+
+jobs:
+  mssql-tests:
+    runs-on: ubuntu-latest
+
+    services:
+      mssql:
+        image: mcr.microsoft.com/mssql/server:2022-latest
+        env:
+          ACCEPT_EULA: "Y"
+          MSSQL_SA_PASSWORD: "Datafaker!Test123"
+        ports:
+          - 1433:1433
+        options: >-
+          --health-cmd "/opt/mssql-tools18/bin/sqlcmd -S localhost -U sa -P 'Datafaker!Test123' -Q 'SELECT 1' -No"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 12
+          --health-start-period 30s
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v6
+
+      - name: Install ODBC Driver 18 for SQL Server
+        shell: bash
+        run: |
+          curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
+            | sudo gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg
+          curl -fsSL "https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list" \
+            | sudo tee /etc/apt/sources.list.d/mssql-release.list
+          sudo apt-get update
+          ACCEPT_EULA=Y sudo apt-get install -y msodbcsql18
+
+      - name: Install poetry
+        shell: bash
+        run: |
+          sudo apt-get install -y python3-poetry  # -y: never wait for a prompt in CI
+
+      - name: Configure poetry
+        shell: bash
+        run: |
+          python -m poetry config virtualenvs.in-project true
+
+      - name: Install dependencies (with mssql extras)
+        shell: bash
+        run: |
+          python -m poetry install --extras mssql
+
+      - name: Run MS-SQL integration tests
+        shell: bash
+        env:
+          MSSQL_TEST_DSN: "mssql+pyodbc://sa:Datafaker!Test123@localhost:1433/master?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes"
+        run: |
+          poetry run python -m unittest tests.test_functional_mssql -v
diff --git a/datafaker/create.py b/datafaker/create.py
@@ -36,6 +36,22 @@
 serial_re = re.compile(r"\bSERIAL\b")
 
 
+
+@compiles(CreateTable, "mssql")
+def remove_mssql_on_delete_cascade(element: CreateTable, compiler: Any, **kw: Any) -> str:
+    """
+    Emit MS-SQL CREATE TABLE DDL with every ON DELETE CASCADE clause removed.
+
+    MS-SQL refuses a table that has more than one cascading FK path to the same
+    target (error 1785).  OMOP-style schemas often put many FK columns on a single
+    table, all referencing one vocabulary table, and so run into that limit.
+    Datafaker can do without CASCADE because it keeps referential integrity by
+    ordering its inserts rather than relying on the database engine.
+    """
+    ddl: str = compiler.visit_create_table(element, **kw)
+    return "".join(ddl.split(" ON DELETE CASCADE"))
+
+
 @compiles(CreateTable, "duckdb")
 def remove_on_delete_cascade(element: CreateTable, compiler: Any, **kw: Any) -> str:
     """

diff --git a/datafaker/db_utils.py b/datafaker/db_utils.py
@@ -38,6 +38,7 @@
     get_ignored_table_names,
     get_vocabulary_table_names,
     logger,
+    make_async_dsn,
     make_foreign_key_name,
 )
 
@@ -139,10 +140,10 @@ def create_db_engine(
     **kwargs: Any,
 ) -> MaybeAsyncEngine:
     """Create a SQLAlchemy Engine."""
+    kwargs.setdefault("pool_pre_ping", True)
     try:
         if use_asyncio:
-            async_dsn = db_dsn.replace("postgresql://", "postgresql+asyncpg://")
-            engine: MaybeAsyncEngine = create_async_engine(async_dsn, **kwargs)
+            engine: MaybeAsyncEngine = create_async_engine(make_async_dsn(db_dsn), **kwargs)
         else:
             engine = create_engine(db_dsn, **kwargs)
     except NoSuchModuleError as exc:
@@ -155,7 +156,10 @@ def create_db_engine(
 
     settings = {}
     if schema_name is not None:
-        settings["search_path"] = schema_name
+        if get_sync_engine(engine).dialect.name == "mssql":
+            engine = engine.execution_options(schema_translate_map={None: schema_name})
+        else:
+            settings["search_path"] = schema_name
     if parquet_dir is not None:
         joined = ",".join(_find_parquet_directories(parquet_dir))
         # double up single quotes

diff --git a/datafaker/interactive/base.py b/datafaker/interactive/base.py
@@ -10,7 +10,7 @@
 
 import sqlalchemy
 from prettytable import PrettyTable
-from sqlalchemy import Engine, ForeignKey, MetaData, Table
+from sqlalchemy import Engine, ForeignKey, MetaData, Table, func, literal_column, or_, select
 from sqlalchemy.exc import DatabaseError, SQLAlchemyError
 from typing_extensions import Self
 
@@ -352,17 +352,13 @@ def do_counts(self, _arg: str) -> None:
             return
         table_name = self.table_name()
         nullable_columns = self.get_nullable_columns(table_name)
-        colcounts = [f', COUNT("{nnc}") AS "{nnc}"' for nnc in nullable_columns]
+        tbl = self.table_metadata()
+        count_exprs = [func.count().label("row_count")] + [
+            func.count(tbl.c[col]).label(col) for col in nullable_columns
+        ]
+        stmt = select(*count_exprs).select_from(tbl)
         with self.sync_engine.connect() as connection:
-            result = (
-                connection.execute(
-                    sqlalchemy.text(
-                        f'SELECT COUNT(*) AS row_count{"".join(colcounts)} FROM "{table_name}"'
-                    )
-                )
-                .mappings()
-                .first()
-            )
+            result = connection.execute(stmt).mappings().first()
             if result is None:
                 self.print("Could not count rows in table {0}", table_name)
                 return
@@ -415,19 +411,23 @@ def do_peek(self, arg: str) -> None:
         col_names = arg.split()
         if not col_names:
             col_names = self._get_column_names()
-        nonnulls = [f'"{cn}" IS NOT NULL' for cn in col_names]
+        random_fn = (
+            func.newid() if self.sync_engine.dialect.name == "mssql" else func.random()
+        )
+        col_exprs = [literal_column(f'"{cn}"') for cn in col_names]
+        nonnull_clauses = [literal_column(f'"{cn}"').isnot(None) for cn in col_names]
+        stmt = (
+            select(*col_exprs)
+            .select_from(self.table_metadata())
+            .where(or_(*nonnull_clauses))
+            .order_by(random_fn)
+            .limit(max_peek_rows)
+        )
         with self.sync_engine.connect() as connection:
-            cols = ", ".join(f'"{cn}"' for cn in col_names)
-            where = "WHERE" if nonnulls else ""
-            nonnull = " OR ".join(nonnulls)
-            query = sqlalchemy.text(
-                f'SELECT {cols} FROM "{table_name}" {where} {nonnull}'
-                f" ORDER BY RANDOM() LIMIT {max_peek_rows}"
-            )
             try:
-                result = connection.execute(query)
+                result = connection.execute(stmt)
             except SQLAlchemyError as exc:
-                self.print(self.ERROR_FAILED_SQL, exc=exc, query=query)
+                self.print(self.ERROR_FAILED_SQL, exc=exc, query=stmt)
                 return
             self.print_table(list(result.keys()), result.fetchmany(max_peek_rows))
 

diff --git a/datafaker/interactive/generators.py b/datafaker/interactive/generators.py
@@ -6,7 +6,7 @@
 from typing import Any, Callable, Optional, cast
 
 import sqlalchemy
-from sqlalchemy import Column
+from sqlalchemy import Column, and_, func, literal_column, select
 
 from datafaker.db_utils import MaybeAsyncEngine, primary_private_fks, table_is_private
 from datafaker.interactive.base import DbCmd, TableEntry, fk_column_name, or_default
@@ -16,6 +16,7 @@
     get_columns_assigned,
     get_row_generators,
     logger,
+    schema_qualified_name,
     split_column_full_name,
 )
 
@@ -61,8 +62,9 @@ def get_aggregate_query(
     ]
     if not clauses:
         return None
+    qualified = schema_qualified_name(table_name, engine)
     alias = f' AS "{table_name}"' if engine.dialect.name == "duckdb" else ""
-    return f'SELECT {", ".join(clauses)} FROM "{table_name}"{alias}'
+    return f'SELECT {", ".join(clauses)} FROM "{qualified}"{alias}'
 
 
 # pylint: disable=too-many-public-methods
@@ -779,15 +781,20 @@ def _get_column_data(
         self, count: int, to_str: Callable[[Any], str] = repr
     ) -> list[list[str]]:
         columns = self._get_column_names()
-        columns_string = ", ".join(columns)
-        pred = " AND ".join(f"{column} IS NOT NULL" for column in columns)
+        random_fn = (
+            func.newid() if self.sync_engine.dialect.name == "mssql" else func.random()
+        )
+        col_exprs = [literal_column(col) for col in columns]
+        nonnull_clauses = [literal_column(col).isnot(None) for col in columns]
+        stmt = (
+            select(*col_exprs)
+            .select_from(self.table_metadata())
+            .where(and_(*nonnull_clauses))
+            .order_by(random_fn)
+            .limit(count)
+        )
         with self.sync_engine.connect() as connection:
-            result = connection.execute(
-                sqlalchemy.text(
-                    f'SELECT {columns_string} FROM "{self.table_name()}"'
-                    f" WHERE {pred} ORDER BY RANDOM() LIMIT {count}"
-                )
-            )
+            result = connection.execute(stmt)
             return [[to_str(x) for x in xs] for xs in result.all()]
 
     def do_propose(self, _arg: str) -> None:

diff --git a/datafaker/interactive/missingness.py b/datafaker/interactive/missingness.py
@@ -23,19 +23,32 @@ class MissingnessType:
     columns: list[str]
 
     @classmethod
-    def sampled_query(cls, table: str, count: int, column_names: Iterable[str]) -> str:
+    def sampled_query(
+        cls,
+        table: str,
+        count: int,
+        column_names: Iterable[str],
+        dialect_name: str = "",
+    ) -> str:
         """
         Construct a query to make a sampling of the named rows of the table.
 
         :param table: The name of the table to sample.
         :param count: The number of samples to get.
         :param column_names: The columns to fetch.
+        :param dialect_name: The SQLAlchemy dialect name (e.g. ``"mssql"``).
         :return: The SQL query to do the sampling.
         """
         result_names = ", ".join([f"{c}__is_null" for c in column_names])
         column_is_nulls = ", ".join(
             [f"{c} IS NULL AS {c}__is_null" for c in column_names]
         )
+        if dialect_name == "mssql":  # T-SQL cannot select a bare predicate; emit BIT flags
+            flag = "CAST(IIF({0} IS NULL, 1, 0) AS BIT) AS {0}__is_null"
+            flags = ", ".join(flag.format(c) for c in column_names)
+            inner = f"SELECT TOP {count} {flags} FROM {table} ORDER BY NEWID()"
+            outer = f"SELECT COUNT(*) AS row_count, {result_names} FROM ({inner})"
+            return f"{outer} AS __t GROUP BY {result_names}"
         return cls.SAMPLED_QUERY.format(
             result_names=result_names,
             column_is_nulls=column_is_nulls,
@@ -330,6 +343,7 @@ def do_sampled(self, arg: str) -> None:
                 entry.name,
                 count,
                 self.get_nullable_columns(entry.name),
+                dialect_name=self.sync_engine.dialect.name,
             ),
             [
                 "The missingness patterns and how often they appear in a"

diff --git a/datafaker/interactive/table.py b/datafaker/interactive/table.py
@@ -4,6 +4,7 @@
 from typing import Any, cast
 
 import sqlalchemy
+from sqlalchemy import func, literal_column, select, text
 
 from datafaker.interactive.base import (
     TYPE_LETTER,
@@ -477,16 +478,23 @@ def print_column_data(self, column: str, count: int, min_length: int) -> None:
         :param count: The number of rows to sample.
         :param min_length: The minimum length of text to choose from (0 for any text).
         """
-        where = f"WHERE {column} IS NOT NULL"
+        random_fn = (
+            func.newid() if self.sync_engine.dialect.name == "mssql" else func.random()
+        )
+        col_expr = literal_column(column)
         if 0 < min_length:
-            where = f"WHERE LENGTH({column}) >= {min_length}"
+            where_clause = func.length(col_expr) >= min_length
+        else:
+            where_clause = col_expr.isnot(None)
+        stmt = (
+            select(col_expr)
+            .select_from(self.table_metadata())
+            .where(where_clause)
+            .order_by(random_fn)
+            .limit(count)
+        )
         with self.sync_engine.connect() as connection:
-            result = connection.execute(
-                sqlalchemy.text(
-                    f'SELECT {column} FROM "{self.table_name()}"'
-                    f" {where} ORDER BY RANDOM() LIMIT {count}"
-                )
-            )
+            result = connection.execute(stmt)
             self.columnize([str(x[0]) for x in result.all()])
 
     def print_row_data(self, count: int) -> None:
@@ -495,12 +503,17 @@ def print_row_data(self, count: int) -> None:
 
         :param count: The number of rows to report.
         """
+        random_fn = (
+            func.newid() if self.sync_engine.dialect.name == "mssql" else func.random()
+        )
+        stmt = (
+            select(text("*"))
+            .select_from(self.table_metadata())
+            .order_by(random_fn)
+            .limit(count)
+        )
         with self.sync_engine.connect() as connection:
-            result = connection.execute(
-                sqlalchemy.text(
-                    f'SELECT * FROM "{self.table_name()}" ORDER BY RANDOM() LIMIT {count}'
-                )
-            )
+            result = connection.execute(stmt)
             if result is None:
                 self.print("No rows in this table!")
                 return