ETL Postrelease Improvements (#207)

andhreljaKern · web-flow · commit 4f5e0b1f4a77 · 2025-12-15T11:20:10.000+01:00
* fix: add backwards compatibility for integration progress

* chore: remove stale comments

* perf: update dataset

* perf: full_config_hash

* perf(etl): is stale check

* perf: new etl attrs

* perf: etl task new attrs

* perf: create/update new etl attrs

* fix: minor improvements

* chore: conflict resolution

* fix: is_stale

* perf: fetch parsing scope

* perf: only fetch inactive enriched datasets for stale config check

* perf: SET NULL on md_file etl_task deletion

* fix: get all enriched etl tasks join criteria

* perf: adds is_valid_extension check for etl task
diff --git a/cognition_objects/integration.py b/cognition_objects/integration.py
@@ -185,15 +185,27 @@ def get_integration_progress(
 ) -> float:
     integration = get_by_id(integration_id)
     count_all_records = integration_records_bo.count(integration)
-    all_tasks = get_all_etl_tasks(integration_id)
-    finished_tasks = [task for task in all_tasks if task.state in FINISHED_STATES]
 
     if (
         count_all_records == 0
         or integration.state == enums.CognitionMarkdownFileState.FAILED.value
     ):
         return 0.0
-    integration_progress = round((len(finished_tasks) / count_all_records) * 100.0, 2)
+
+    all_tasks = get_all_etl_tasks(integration_id)
+    finished_tasks = [task for task in all_tasks if task.state in FINISHED_STATES]
+    count_finished_tasks = len(finished_tasks)
+
+    # backward compatibility
+    if not all_tasks or len(all_tasks) != count_all_records:
+        all_records, _ = integration_records_bo.get_all_by_integration_id(
+            integration_id
+        )
+        count_finished_tasks += len(
+            [record for record in all_records if not record.etl_task_id]
+        )
+
+    integration_progress = round((count_finished_tasks / count_all_records) * 100.0, 2)
     if integration.state not in FINISHED_STATES:
         integration_progress = min(integration_progress - 1, 0)
     return integration_progress
diff --git a/cognition_objects/markdown_dataset.py b/cognition_objects/markdown_dataset.py
@@ -34,7 +34,7 @@ def __get_enriched_query(
     category_origin: Optional[str] = None,
     query_add: Optional[str] = "",
 ) -> str:
-    where_add = " AND (ed.config_ids->>'isDefault')::bool is true"
+    where_add = ""
     if id:
         id = prevent_sql_injection(id, isinstance(id, str))
         where_add += f" AND md.id = '{id}'"
@@ -46,7 +46,8 @@ def __get_enriched_query(
             md.*, 
             COALESCE(mf.num_files, 0) AS num_files, 
             COALESCE(mf.num_reviewed_files, 0) AS num_reviewed_files, 
-            ecp.etl_config
+            ecp.etl_config,
+            ecp.id as etl_config_id
         FROM cognition.{Tablenames.MARKDOWN_DATASET.value} md
         LEFT JOIN (
             SELECT dataset_id, COUNT(*) as num_files, COUNT(CASE WHEN is_reviewed = TRUE THEN 1 END) AS num_reviewed_files
@@ -56,7 +57,7 @@ def __get_enriched_query(
         LEFT JOIN(
             SELECT md.id, json_array_elements(md.useable_etl_configurations) config_ids 
             FROM cognition.{Tablenames.MARKDOWN_DATASET.value} md
-        ) ed ON ed.id = md.id
+        ) ed ON ed.id = md.id AND (ed.config_ids->>'isDefault')::bool is true
         LEFT JOIN(
             SELECT ecp.id, ecp.etl_config
             FROM cognition.{Tablenames.ETL_CONFIG_PRESET.value} ecp
@@ -177,6 +178,7 @@ def update(
     dataset_id: str,
     name: Optional[str] = None,
     description: Optional[str] = None,
+    useable_etl_configurations: Optional[List[Dict[str, Any]]] = None,
     with_commit: bool = True,
 ) -> CognitionMarkdownDataset:
     dataset = get(org_id, dataset_id)
@@ -187,6 +189,9 @@ def update(
     if description:
         dataset.description = description
 
+    if useable_etl_configurations:
+        dataset.useable_etl_configurations = useable_etl_configurations
+
     general.flush_or_commit(with_commit)
 
     return dataset
diff --git a/cognition_objects/markdown_file.py b/cognition_objects/markdown_file.py
@@ -93,14 +93,20 @@ def __get_enriched_query(
         "etl_task",
         "global",
         prefix=et_prefix,
-        include_columns=["is_active", "error_message"],
+        include_columns=[
+            "started_at",
+            "finished_at",
+            "is_active",
+            "is_stale",
+            "llm_ops",
+            "error_message",
+        ],
     )
 
     query = f"""SELECT
-        {mf_select}, {et_select}, LENGTH({mf_prefix}.content) as content_length,
-        COALESCE({et_prefix}.state, {mf_prefix}.state) state,
-        COALESCE({et_prefix}.started_at, {mf_prefix}.started_at) started_at,
-        COALESCE({et_prefix}.finished_at, {mf_prefix}.finished_at) finished_at
+        {mf_select}, {et_select}, LENGTH({mf_prefix}.content) AS content_length,
+        COALESCE({et_prefix}.state, {mf_prefix}.state) AS state,
+        {et_prefix}.meta_data->>'scope_readable' AS scope_readable
     FROM cognition.markdown_file {mf_prefix}
     LEFT JOIN global.etl_task {et_prefix} ON {mf_prefix}.etl_task_id = {et_prefix}.id
     """
diff --git a/etl_utils.py b/etl_utils.py
@@ -301,17 +301,14 @@ def rm_tree(path: Path):
             if item.is_dir():
                 rm_tree(item)
             else:
-                item.unlink()
+                item.unlink(missing_ok=True)
         path.rmdir()
 
     etl_cache_dir = ETL_DIR / org_id / download_id
     if etl_cache_dir.exists() and etl_cache_dir.is_dir():
         rm_tree(etl_cache_dir)
 
 
-# TODO: delete_etl_tasks for related file_reference_id
-
-
 def get_download_key(org_id: str, download_id: str) -> Path:
     return Path(org_id) / download_id / "download"
 
diff --git a/global_objects/etl_task.py b/global_objects/etl_task.py
diff --git a/models.py b/models.py