14 | 14 | * Call Plugin.prerun() to generate new output |
15 | 15 | """ |
16 | 16 |
| 17 | +import decimal |
17 | 18 | import glob |
18 | 19 | import hashlib |
19 | 20 | import importlib |
20 | 21 | import importlib.metadata |
21 | 22 | import logging |
22 | 23 | import multiprocessing |
| 24 | +from multiprocessing.pool import ThreadPool |
23 | 25 | from typing import Any, Iterable, Mapping, Optional, Sequence, Tuple, Type, Union |
24 | 26 |
25 | 27 | import duckdb |
| 28 | +import psutil |
26 | 29 | import pyarrow |
27 | 30 | from duckdb import DuckDBPyConnection, DuckDBPyRelation |
28 | 31 |
29 | 32 | from countess.core.parameters import BaseParam, FileArrayParam, FileParam, HasSubParametersMixin, MultiParam |
30 | | -from countess.utils.duckdb import duckdb_concatenate, duckdb_escape_identifier, duckdb_source_to_view |
| 33 | +from countess.utils.duckdb import duckdb_combine, duckdb_concatenate, duckdb_escape_identifier, duckdb_source_to_view |
31 | 34 |
32 | 35 | PRERUN_ROW_LIMIT: int = 100000 |
33 | 36 |
@@ -112,29 +115,26 @@ class DuckdbPlugin(BasePlugin): |
112 | 115 |     # XXX expand this, or find in library somewhere
113 | 116 |     ALLOWED_TYPES = {"INTEGER", "VARCHAR", "FLOAT", "DOUBLE", "DECIMAL"}
114 | 117 |
| 118 | +    def prepare_multi(self, ddbc: DuckDBPyConnection, sources: Mapping[str, DuckDBPyRelation]) -> None:
| 119 | +        pass
| 120 | +
115 | 121 |     def execute_multi(
116 | 122 |         self, ddbc: DuckDBPyConnection, sources: Mapping[str, DuckDBPyRelation]
117 | 123 |     ) -> Optional[DuckDBPyRelation]:
118 | 124 |         raise NotImplementedError(f"{self.__class__}.execute_multi")
119 | 125 |
120 | 126 |
121 | 127 | class DuckdbSimplePlugin(DuckdbPlugin):
122 | | -    def execute_multi(
123 | | -        self, ddbc: DuckDBPyConnection, sources: Mapping[str, DuckDBPyRelation]
124 | | -    ) -> Optional[DuckDBPyRelation]:
125 | | -        tables = list(sources.values())
126 | | -        if len(sources) > 1:
127 | | -            source = duckdb_source_to_view(ddbc, duckdb_concatenate(tables))
128 | | -        elif len(sources) == 1:
129 | | -            source = tables[0]
130 | | -        else:
131 | | -            source = None
132 | | -
133 | | -        logger.debug("DuckdbSimplePlugin execute_multi %s", source.alias)
| 128 | +    def prepare_multi(self, ddbc: DuckDBPyConnection, sources: Mapping[str, DuckDBPyRelation]) -> None:
| 129 | +        self.prepare(ddbc, duckdb_combine(ddbc, list(sources.values())))
134 | 130 |
| 131 | +    def prepare(self, ddbc: DuckDBPyConnection, source: Optional[DuckDBPyRelation]) -> None:
135 | 132 |         self.set_column_choices([] if source is None else source.columns)
136 | 133 |
137 | | -        return self.execute(ddbc, source)
| 134 | +    def execute_multi(
| 135 | +        self, ddbc: DuckDBPyConnection, sources: Mapping[str, DuckDBPyRelation]
| 136 | +    ) -> Optional[DuckDBPyRelation]:
| 137 | +        return self.execute(ddbc, duckdb_combine(ddbc, list(sources.values())))
138 | 138 |
139 | 139 |     def execute(self, ddbc: DuckDBPyConnection, source: Optional[DuckDBPyRelation]) -> Optional[DuckDBPyRelation]:
140 | 140 |         raise NotImplementedError(f"{self.__class__}.execute")
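With this change a simple plugin no longer combines its own inputs: `prepare_multi`/`prepare` handle column-choice setup, and `execute_multi` funnels everything through `duckdb_combine` (presumably the concatenate-or-single-source logic previously inlined above, now shared). A minimal sketch of a subclass under the new split; the class name, the `name` column, and the `countess.core.plugins` import path are illustrative assumptions:

```python
from typing import Optional

from duckdb import DuckDBPyConnection, DuckDBPyRelation

from countess.core.plugins import DuckdbSimplePlugin  # assumed module path


class UpperCaseNames(DuckdbSimplePlugin):
    """Hypothetical plugin: upper-cases a 'name' column, passing other columns through."""

    def execute(
        self, ddbc: DuckDBPyConnection, source: Optional[DuckDBPyRelation]
    ) -> Optional[DuckDBPyRelation]:
        if source is None:
            return None
        # DuckDB's star-REPLACE keeps every column, substituting the rewritten one
        return source.project('* REPLACE (upper("name") AS "name")')
```

Subclasses usually only override `execute`: the base class's `prepare_multi` combines the inputs once, and `prepare` feeds the resulting columns to `set_column_choices`.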
@@ -248,94 +248,72 @@ def filter(self, *_) -> bool: |
248 | 248 |         raise NotImplementedError(f"{self.__class__}.transform")
249 | 249 |
250 | 250 |
251 | | -class DuckdbTransformPlugin(DuckdbSimplePlugin):
252 | | -    def dropped_columns(self) -> set[str]:
253 | | -        return set()
254 | | -
255 | | -    def output_columns(self) -> dict[str, str]:
256 | | -        """Return a dictionary of `column name` -> `dbtype`
257 | | -        which will be used to construct the user-defined
258 | | -        function. The columns returned by transform() must
259 | | -        match the columns declared here."""
| 251 | +def _python_type_to_arrow_dtype(ttype: type) -> pyarrow.DataType:
| 252 | +    if ttype in (float, decimal.Decimal):
| 253 | +        return pyarrow.float64()
| 254 | +    elif ttype is int:
| 255 | +        return pyarrow.int64()
| 256 | +    elif ttype is bool:
| 257 | +        return pyarrow.bool_()
| 258 | +    else:
| 259 | +        return pyarrow.string()
260 | 260 |
261 | | -        raise NotImplementedError(f"{self.__class__}.output_columns")
262 | 261 |
263 | | -    def execute(self, ddbc, source):
| 262 | +class DuckdbTransformPlugin(DuckdbSimplePlugin):
| 263 | +    def __init__(self, *args, **kwargs):
| 264 | +        super().__init__(*args, **kwargs)
| 265 | +        self.view_name = f"v_{id(self)}"
| 266 | +
| 267 | +    def get_reader(self, source):
| 268 | +        return source.to_arrow_table().to_reader(max_chunksize=2048)
| 269 | +
| 270 | +    def remove_fields(self, field_names: list[str]) -> list[str]:
| 271 | +        return []
| 272 | +
| 273 | +    def add_fields(self) -> Mapping[str, type]:
| 274 | +        return {}
| 275 | +
| 276 | +    def fix_schema(self, schema: pyarrow.Schema) -> pyarrow.Schema:
| 277 | +        logger.debug("DuckdbTransformPlugin.fix_schema in %s", schema.to_string())
| 278 | +        for field_name in self.remove_fields(schema.names):
| 279 | +            if field_name in schema.names:
| 280 | +                schema = schema.remove(schema.get_field_index(field_name))
| 281 | +        for field_name, ttype in self.add_fields().items():
| 282 | +            if field_name and ttype is not None:
| 283 | +                schema = schema.append(pyarrow.field(field_name, _python_type_to_arrow_dtype(ttype)))
| 284 | +        return schema
| 285 | +
| 286 | +    def execute(self, ddbc: DuckDBPyConnection, source: DuckDBPyRelation) -> DuckDBPyRelation:
264 | 287 |         """Read `source` and call `self.transform` for every row."""
265 | 288 |
266 | | -        # if you happen to have an output column with the same name as an
267 | | -        # input column this drops it, as well as any columns being explicitly
268 | | -        # dropped.
269 | | -        drop_columns_set = set(list(self.output_columns().keys()) + list(self.dropped_columns()))
| 289 | +        reader = self.get_reader(source)
| 290 | +        ddbc.register(self.view_name, pyarrow.Table.from_batches(self.transform_batch(batch) for batch in reader))
| 291 | +        return ddbc.view(self.view_name)
270 | 292 |
271 | | -        # Make up an arbitrary unique name for our temporary function
272 | | -        function_name = f"f_{id(self)}"
273 | | -
274 | | -        # Output type has to be completely defined, with types and all
275 | | -        output_type = (
276 | | -            "STRUCT("
277 | | -            + ",".join(
278 | | -                f"{duckdb_escape_identifier(k)} {str(v).upper()}"
279 | | -                for k, v in self.output_columns().items()
280 | | -                if k is not None and v is not None
281 | | -            )
282 | | -            + ")"
| 293 | +    def transform_batch(self, batch: pyarrow.RecordBatch) -> pyarrow.RecordBatch:
| 294 | +        schema = self.fix_schema(batch.schema)
| 295 | +        return pyarrow.RecordBatch.from_pylist(
| 296 | +            [t for t in (self.transform(row) for row in batch.to_pylist()) if t is not None], schema=schema
283 | 297 |         )
284 | 298 |
285 | | -        # source columns which aren't being dropped get copied into the projection
286 | | -        # in their original order, followed by the generated output columns.
287 | | -        keep_columns = " ".join(f"{duckdb_escape_identifier(k)}," for k in source.columns if k not in drop_columns_set)
288 | | -
289 | | -        logger.debug("DuckDbTransformPlugin.query function_name %s", function_name)
290 | | -        logger.debug("DuckDbTransformPlugin.query output_type %s", output_type)
291 | | -        logger.debug("DuckDbTransformPlugin.query keep_columns %s", keep_columns)
292 | | -
293 | | -        # if the function already exists, remove it
294 | | -        try:
295 | | -            ddbc.remove_function(function_name)
296 | | -            logger.debug("DuckDbTransformPlugin.query removed function %s", function_name)
297 | | -        except duckdb.InvalidInputException as exc:
298 | | -            if not str(exc).startswith("Invalid Input Error: No function by the name of '"):
299 | | -                # some other error
300 | | -                logger.debug("DuckDbTransformPlugin.query can't remove function %s: %s", function_name, exc)
301 | | -
302 | | -        # XXX it'd be nice to have an arrow version of this
303 | | -        # to allow easy parallelization, but see:
304 | | -        # https://github.com/duckdb/duckdb/issues/15626
305 | | -        # Appears to be fixed in 1.1.4.dev4815
306 | | -
307 | | -        ddbc.create_function(
308 | | -            name=function_name,
309 | | -            function=self.transform_arrow,
310 | | -            type="arrow",
311 | | -            return_type=output_type,
312 | | -            null_handling="special",
313 | | -            side_effects=False,
314 | | -        )
315 | | -
316 | | -        # the "SELECT func(_row) FROM {table} _row" bit passes
317 | | -        # a whole row to the function, sadly there's no way
318 | | -        # to express this in a `.project()`.
319 | | -
320 | | -        sql_command = f"SELECT {keep_columns} UNNEST({function_name}(_row)) FROM {source.alias} _row"
321 | | -        logger.debug("DuckDbTransformPlugin.query sql_command %s", sql_command)
322 | | -
323 | | -        self.prepare(source)
324 | | -
325 | | -        return duckdb_source_to_view(ddbc, ddbc.sql(sql_command))
326 | | -
327 | | -    def prepare(self, source: DuckDBPyRelation):
328 | | -        """Called before the transform functions are run, to prepare anything
329 | | -        which needs preparation ..."""
330 | | -        pass
331 | | -
332 | | -    def transform_arrow(self, data: pyarrow.array) -> pyarrow.array:
333 | | -        logger.debug("DuckDbTransformPlugin.transform_arrow %d", len(data))
334 | | -        pool = multiprocessing.Pool(processes=4)
335 | | -        return pyarrow.array(pool.imap_unordered(self.transform, data.to_pylist()))
336 | | -
337 | 299 |     def transform(self, data: dict[str, Any]) -> Optional[dict[str, Any]]:
338 | 300 |         """This will be called for each row.  Return a dictionary whose keys
339 | 301 |         and value types match the schema produced by `fix_schema` (the source
340 | 302 |         columns minus `remove_fields` plus `add_fields`), or None to drop the row."""
341 | 303 |         raise NotImplementedError(f"{self.__class__}.transform")
| 304 | +
| 305 | +
| 306 | +class DuckdbThreadedTransformPlugin(DuckdbTransformPlugin):
| 307 | +    def execute(self, ddbc: DuckDBPyConnection, source: DuckDBPyRelation) -> DuckDBPyRelation:
| 308 | +        with ThreadPool(processes=psutil.cpu_count()) as pool:
| 309 | +            reader = self.get_reader(source)
| 310 | +            ddbc.register(self.view_name, pyarrow.Table.from_batches(pool.imap_unordered(self.transform_batch, reader)))
| 311 | +        return ddbc.view(self.view_name)
| 312 | +
| 313 | +
| 314 | +class DuckdbParallelTransformPlugin(DuckdbTransformPlugin):
| 315 | +    def execute(self, ddbc: DuckDBPyConnection, source: DuckDBPyRelation) -> DuckDBPyRelation:
| 316 | +        with multiprocessing.Pool(processes=psutil.cpu_count()) as pool:
| 317 | +            reader = self.get_reader(source)
| 318 | +            ddbc.register(self.view_name, pyarrow.Table.from_batches(pool.imap_unordered(self.transform_batch, reader)))
| 319 | +        return ddbc.view(self.view_name)
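Putting the new transform pipeline together: `get_reader` slices the source relation into Arrow record batches, then `transform_batch` turns each batch into Python dicts, calls `transform` once per row, drops `None` results, and rebuilds the batch against the schema from `fix_schema`. A minimal sketch of a subclass, with hypothetical class and column names (the import path is also an assumption):

```python
from typing import Any, Mapping, Optional

from countess.core.plugins import DuckdbThreadedTransformPlugin  # assumed module path


class NameLength(DuckdbThreadedTransformPlugin):
    """Hypothetical plugin: drops 'comment' and adds an integer 'name_length' column."""

    def remove_fields(self, field_names: list[str]) -> list[str]:
        return ["comment"]

    def add_fields(self) -> Mapping[str, type]:
        # fix_schema() maps int to pyarrow.int64() via _python_type_to_arrow_dtype()
        return {"name_length": int}

    def transform(self, data: dict[str, Any]) -> Optional[dict[str, Any]]:
        if data.get("name") is None:
            return None  # returning None drops the row from the output batch
        data["name_length"] = len(data["name"])
        return data  # keys not in the fixed schema are ignored by from_pylist
```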
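Two design notes on the pooled variants: `imap_unordered` yields batches in completion order, so both subclasses may reorder rows relative to the input (`pool.imap` is the ordered drop-in if that matters), and `DuckdbParallelTransformPlugin` additionally requires the plugin instance and each `RecordBatch` to be picklable, since work is shipped to separate processes rather than threads.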