diff --git a/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx b/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx
index b8d8ada1..f967b50e 100644
--- a/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx
+++ b/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx
@@ -39,7 +39,7 @@ export default function TemplateList() {
},
{
key: "delete",
- label: t("dataCleansing.actions.deleteTemplate"),
+ label: t("dataCleansing.actions.delete"),
danger: true,
icon: ,
onClick: deleteTemplate, // implement delete logic
diff --git a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx
index 5484a9e1..55c3891b 100644
--- a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx
+++ b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx
@@ -22,7 +22,7 @@ export default function ChangeLog({ operator }) {
)}
- {release.changelog.map((change, changeIndex) => (
+ {release.changelog?.map((change, changeIndex) => (
-
{change}
diff --git a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx
index c22d1bc7..04dcf210 100644
--- a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx
+++ b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx
@@ -76,14 +76,10 @@ export default function OperatorMarketPage() {
};
const handleDeleteOperator = async (operator: OperatorI) => {
- try {
- await deleteOperatorByIdUsingDelete(operator.id);
- message.success(t("operatorMarket.home.operations.messages.deleteSuccess"));
- fetchData();
- await initCategoriesTree();
- } catch (error) {
- message.error(t("operatorMarket.home.operations.messages.deleteFailed"));
- }
+ await deleteOperatorByIdUsingDelete(operator.id);
+ message.success(t("operatorMarket.home.operations.messages.deleteSuccess"));
+ fetchData();
+ await initCategoriesTree();
};
const handleStar = async (operator: OperatorI) => {
diff --git a/frontend/src/pages/OperatorMarket/operator.const.tsx b/frontend/src/pages/OperatorMarket/operator.const.tsx
index 1104a118..75d99cb0 100644
--- a/frontend/src/pages/OperatorMarket/operator.const.tsx
+++ b/frontend/src/pages/OperatorMarket/operator.const.tsx
@@ -148,10 +148,10 @@ export const mapOperator = (op: OperatorI, t: (key: string) => string) => {
label: t("operatorMarket.const.language"),
value: "Python",
},
- {
- label: t("operatorMarket.const.function"),
- value: functionLabel,
- },
+ // {
+ // label: t("operatorMarket.const.function"),
+ // value: functionLabel,
+ // },
],
};
};
@@ -198,4 +198,4 @@ export const formatBytes = (bytes: number | null | undefined, decimals: number =
// 4. 格式化数值并拼接单位
// parseFloat 用于去掉末尾多余的 0 (例如 "1.20 MB" -> "1.2 MB")
return `${parseFloat((bytes / Math.pow(k, i)).toFixed(dm))} ${sizes[i]}`;
-};
\ No newline at end of file
+};
diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts
index 8bc37239..ebd6935d 100644
--- a/frontend/vite.config.ts
+++ b/frontend/vite.config.ts
@@ -13,54 +13,25 @@ export default defineConfig({
},
server: {
host: "0.0.0.0",
- proxy: (() => {
- const pythonProxyConfig = {
- target: "http://localhost:18000",
+ proxy: {
+ "^/api": {
+ target: "http://localhost:8080", // 本地后端服务地址
changeOrigin: true,
secure: false,
- configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
- proxy.on("proxyReq", (proxyReq: unknown) => {
- (proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
- (proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
+ rewrite: (path) => path.replace(/^\/api/, "/api"),
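+ // note: this rewrite is currently an identity mapping and leaves the /api prefix unchanged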
+ configure: (proxy, options) => {
+ // proxy is an instance of 'http-proxy'
+ proxy.on("proxyReq", (proxyReq, req, res) => {
+ // request headers can be modified here
+ proxyReq.removeHeader("referer");
+ proxyReq.removeHeader("origin");
});
- proxy.on("proxyRes", (proxyRes: unknown) => {
- const res = proxyRes as { headers: Record };
- delete res.headers["set-cookie"];
- res.headers["cookies"] = "";
+ proxy.on("proxyRes", (proxyRes, req, res) => {
+ delete proxyRes.headers["set-cookie"];
+ proxyRes.headers["cookies"] = ""; // 清除 cookies 头
});
},
- };
-
- const javaProxyConfig = {
- target: "http://localhost:8080",
- changeOrigin: true,
- secure: false,
- configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
- proxy.on("proxyReq", (proxyReq: unknown) => {
- (proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
- (proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
- });
- proxy.on("proxyRes", (proxyRes: unknown) => {
- const res = proxyRes as { headers: Record };
- delete res.headers["set-cookie"];
- res.headers["cookies"] = "";
- });
- },
- };
-
- // Python 服务: rag, synthesis, annotation, evaluation, models
- const pythonPaths = ["rag", "synthesis", "annotation", "data-collection", "evaluation", "models"];
- // Java 服务: data-management, knowledge-base
- const javaPaths = ["data-management", "knowledge-base", "operators"];
-
- const proxy: Record = {};
- for (const p of pythonPaths) {
- proxy[`/api/${p}`] = pythonProxyConfig;
- }
- for (const p of javaPaths) {
- proxy[`/api/${p}`] = javaProxyConfig;
- }
- return proxy;
- })(),
+ },
+ },
},
});
diff --git a/runtime/datamate-python/app/core/exception/codes.py b/runtime/datamate-python/app/core/exception/codes.py
index d741174b..294e6d56 100644
--- a/runtime/datamate-python/app/core/exception/codes.py
+++ b/runtime/datamate-python/app/core/exception/codes.py
@@ -86,6 +86,26 @@ def __init__(self):
RATIO_ALREADY_EXISTS: Final = ErrorCode("ratio.0003", "Task already exists", 400)
RATIO_DELETE_FAILED: Final = ErrorCode("ratio.0004", "Failed to delete task", 500)
+ # ========== Cleaning module ==========
+ CLEANING_TASK_NOT_FOUND: Final = ErrorCode("cleaning.0001", "Cleaning task not found", 404)
+ CLEANING_NAME_DUPLICATED: Final = ErrorCode("cleaning.0002", "Cleaning task name is duplicated", 400)
+ CLEANING_TEMPLATE_NOT_FOUND: Final = ErrorCode("cleaning.0003", "Cleaning template not found", 404)
+ CLEANING_TEMPLATE_NAME_DUPLICATED: Final = ErrorCode("cleaning.0004", "Cleaning template name is duplicated", 400)
+ CLEANING_INVALID_OPERATOR_INPUT: Final = ErrorCode("cleaning.0005", "Invalid operator input/output types", 400)
+ CLEANING_INVALID_EXECUTOR_TYPE: Final = ErrorCode("cleaning.0006", "Invalid executor type", 400)
+ CLEANING_DATASET_NOT_FOUND: Final = ErrorCode("cleaning.0007", "Dataset not found", 404)
+ CLEANING_FILE_SYSTEM_ERROR: Final = ErrorCode("cleaning.0008", "File system error", 500)
+ CLEANING_SETTINGS_PARSE_ERROR: Final = ErrorCode("cleaning.0009", "Settings parse error", 400)
+ CLEANING_TASK_ID_REQUIRED: Final = ErrorCode("cleaning.0010", "Task ID is required", 400)
+
+ # ========== Operator market module ==========
+ OPERATOR_NOT_FOUND: Final = ErrorCode("operator.0001", "Operator not found", 404)
+ OPERATOR_IN_INSTANCE: Final = ErrorCode("operator.0002", "Operator is in use", 400)
+ OPERATOR_CANNOT_DELETE_PREDEFINED: Final = ErrorCode("operator.0003", "Cannot delete predefined operator", 400)
+ OPERATOR_UNSUPPORTED_FILE_TYPE: Final = ErrorCode("operator.0004", "Unsupported file type", 400)
+ OPERATOR_PARSE_FAILED: Final = ErrorCode("operator.0005", "Failed to parse operator package", 400)
+ OPERATOR_FIELD_NOT_FOUND: Final = ErrorCode("operator.0006", "Required field is missing", 400)
+
# ========== 系统模块 ==========
SYSTEM_MODEL_NOT_FOUND: Final = ErrorCode("system.0006", "Model configuration not found", 404)
SYSTEM_MODEL_HEALTH_CHECK_FAILED: Final = ErrorCode("system.0007", "Model health check failed", 500)
diff --git a/runtime/datamate-python/app/core/exception/middleware.py b/runtime/datamate-python/app/core/exception/middleware.py
index 82b03ca2..561d130d 100644
--- a/runtime/datamate-python/app/core/exception/middleware.py
+++ b/runtime/datamate-python/app/core/exception/middleware.py
@@ -69,7 +69,7 @@ async def dispatch(self, request: Request, call_next):
except Exception as exc:
# 捕获所有未处理的异常
logger.error(
- f"Unhandled exception occurred at {request.method} {request.url.path}",
+ f"Unhandled exception occurred at {request.method} {request.url.path}", exc,
exc_info=True
)
return self._error_response(
diff --git a/runtime/datamate-python/app/db/models/__init__.py b/runtime/datamate-python/app/db/models/__init__.py
index 2b83de26..060e4b64 100644
--- a/runtime/datamate-python/app/db/models/__init__.py
+++ b/runtime/datamate-python/app/db/models/__init__.py
@@ -21,6 +21,17 @@
EvaluationItem
)
+from .operator import (
+ Operator,
+ Category,
+ CategoryRelation,
+ OperatorRelease
+)
+
+from .chunk_upload import (
+ ChunkUploadPreRequest
+)
+
__all__ = [
"Dataset",
"DatasetTag",
@@ -32,4 +43,9 @@
"LabelingProject",
"EvaluationTask",
"EvaluationItem",
+ "Operator",
+ "Category",
+ "CategoryRelation",
+ "OperatorRelease",
+ "ChunkUploadPreRequest",
]
diff --git a/runtime/datamate-python/app/db/models/chunk_upload.py b/runtime/datamate-python/app/db/models/chunk_upload.py
new file mode 100644
index 00000000..e110af98
--- /dev/null
+++ b/runtime/datamate-python/app/db/models/chunk_upload.py
@@ -0,0 +1,38 @@
+"""
+Chunk Upload Database Model
+分片上传数据库模型
+"""
+from sqlalchemy import Column, String, Integer, DateTime
+from sqlalchemy.sql import func
+
+from app.db.models.base_entity import Base
+
+
+class ChunkUploadPreRequest(Base):
+ """分片上传预请求"""
+ __tablename__ = "t_chunk_upload_request"
+
+ id = Column(String(36), primary_key=True, comment="请求ID")
+ total_file_num = Column(Integer, nullable=False, comment="总文件数")
+ uploaded_file_num = Column(Integer, nullable=True, comment="已上传文件数")
+ upload_path = Column(String(512), nullable=False, comment="文件路径")
+ timeout = Column(DateTime, nullable=False, comment="上传请求超时时间")
+ service_id = Column(String(64), nullable=True, comment="上传请求所属服务ID")
+ check_info = Column(String(512), nullable=True, comment="业务信息")
+
+ def increment_uploaded_file_num(self):
+ """增加已上传文件数"""
+ if self.uploaded_file_num is None:
+ self.uploaded_file_num = 1
+ else:
+ self.uploaded_file_num += 1
+
+ def is_upload_complete(self) -> bool:
+ """检查是否已完成上传"""
+ return (self.uploaded_file_num is not None and
+ self.uploaded_file_num == self.total_file_num)
+
+ def is_request_timeout(self) -> bool:
+ """检查是否已超时"""
+ from datetime import datetime
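+ # assumes "timeout" was persisted as a naive UTC datetime, to match datetime.utcnow()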
+ return self.timeout is not None and datetime.utcnow() > self.timeout
diff --git a/runtime/datamate-python/app/db/models/cleaning.py b/runtime/datamate-python/app/db/models/cleaning.py
new file mode 100644
index 00000000..c2965be9
--- /dev/null
+++ b/runtime/datamate-python/app/db/models/cleaning.py
@@ -0,0 +1,59 @@
+from sqlalchemy import Column, String, BigInteger, Integer, TIMESTAMP
+from app.db.models.base_entity import BaseEntity, Base
+
+
+class CleaningTask(BaseEntity):
+ """Data cleaning task entity"""
+ __tablename__ = "t_clean_task"
+
+ id = Column(String(36), primary_key=True, comment="Task ID")
+ name = Column(String(255), nullable=False, comment="Task name")
+ description = Column(String(1024), nullable=True, comment="Task description")
+ status = Column(String(50), nullable=False, default="PENDING", comment="Task status: PENDING, RUNNING, COMPLETED, STOPPED, FAILED")
+ src_dataset_id = Column(String(36), nullable=False, comment="Source dataset ID")
+ src_dataset_name = Column(String(255), nullable=False, comment="Source dataset name")
+ dest_dataset_id = Column(String(36), nullable=True, comment="Destination dataset ID")
+ dest_dataset_name = Column(String(255), nullable=True, comment="Destination dataset name")
+ before_size = Column(BigInteger, nullable=True, comment="Data size before cleaning")
+ after_size = Column(BigInteger, nullable=True, comment="Data size after cleaning")
+ file_count = Column(Integer, nullable=True, comment="Total file count")
+ retry_count = Column(Integer, default=0, nullable=False, comment="Retry count")
+ started_at = Column(TIMESTAMP, nullable=True, comment="Task start time")
+ finished_at = Column(TIMESTAMP, nullable=True, comment="Task finish time")
+
+
+class CleaningTemplate(BaseEntity):
+ """Data cleaning template entity"""
+ __tablename__ = "t_clean_template"
+
+ id = Column(String(36), primary_key=True, comment="Template ID")
+ name = Column(String(255), nullable=False, comment="Template name")
+ description = Column(String(1024), nullable=True, comment="Template description")
+
+
+class CleaningResult(Base):
+ """Data cleaning result entity"""
+ __tablename__ = "t_clean_result"
+
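+ # (instance_id, src_file_id) form a composite primary key: one result row per source file per task/template run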
+ instance_id = Column(String(36), primary_key=True, comment="Instance ID (task or template ID)")
+ src_file_id = Column(String(36), primary_key=True, comment="Source file ID")
+ dest_file_id = Column(String(36), nullable=True, comment="Destination file ID")
+ src_name = Column(String(512), nullable=True, comment="Source file name")
+ dest_name = Column(String(512), nullable=True, comment="Destination file name")
+ src_type = Column(String(50), nullable=True, comment="Source file type")
+ dest_type = Column(String(50), nullable=True, comment="Destination file type")
+ src_size = Column(BigInteger, nullable=True, comment="Source file size")
+ dest_size = Column(BigInteger, nullable=True, comment="Destination file size")
+ status = Column(String(50), nullable=True, comment="Cleaning status: COMPLETED, FAILED, etc.")
+ result = Column(String(1024), nullable=True, comment="Cleaning result message")
+
+
+class OperatorInstance(Base):
+ """Operator instance in task or template"""
+ __tablename__ = "t_operator_instance"
+
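+ # composite primary key (instance_id, operator_id); op_index preserves the execution order within an instance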
+ instance_id = Column(String(36), primary_key=True, comment="Instance ID (task or template ID)")
+ operator_id = Column(String(36), primary_key=True, comment="Operator ID")
+ op_index = Column(Integer, nullable=False, comment="Operator execution order")
+ settings_override = Column(String(4096), nullable=True, comment="Operator settings override (JSON)")
+
diff --git a/runtime/datamate-python/app/db/models/operator.py b/runtime/datamate-python/app/db/models/operator.py
new file mode 100644
index 00000000..57362461
--- /dev/null
+++ b/runtime/datamate-python/app/db/models/operator.py
@@ -0,0 +1,70 @@
+"""
+Operator Market Data Models
+算子市场数据模型
+"""
+from sqlalchemy import Column, String, Integer, Boolean, BigInteger, Text, JSON, TIMESTAMP, Index
+from sqlalchemy.sql import func
+
+from app.db.models.base_entity import Base, BaseEntity
+
+
+class Operator(BaseEntity):
+ """算子实体"""
+ __tablename__ = "t_operator"
+
+ id = Column(String(36), primary_key=True, index=True, comment="算子ID")
+ name = Column(String(255), nullable=False, comment="算子名称")
+ description = Column(Text, nullable=True, comment="算子描述")
+ version = Column(String(50), nullable=False, comment="算子版本")
+ inputs = Column(Text, nullable=True, comment="输入定义(JSON)")
+ outputs = Column(Text, nullable=True, comment="输出定义(JSON)")
+ runtime = Column(Text, nullable=True, comment="运行时配置(JSON)")
+ settings = Column(Text, nullable=True, comment="算子设置(JSON)")
+ file_name = Column(String(255), nullable=True, comment="文件名")
+ file_size = Column(BigInteger, nullable=True, comment="文件大小(字节)")
+ metrics = Column(Text, nullable=True, comment="算子指标(JSON)")
+ usage_count = Column(Integer, default=0, nullable=False, comment="使用次数")
+ is_star = Column(Boolean, default=False, nullable=False, comment="是否收藏")
+
+ __table_args__ = (
+ Index("idx_is_star", "is_star"),
+ )
+
+
+class Category(BaseEntity):
+ """算子分类实体"""
+ __tablename__ = "t_operator_category"
+
+ id = Column(String(36), primary_key=True, index=True, comment="分类ID")
+ name = Column(String(255), nullable=False, comment="分类名称")
+ value = Column(String(255), nullable=True, comment="分类值")
+ type = Column(String(50), nullable=True, comment="分类类型")
+ parent_id = Column(String(36), nullable=False, default="0", comment="父分类ID")
+
+
+class CategoryRelation(BaseEntity):
+ """算子分类关系实体"""
+ __tablename__ = "t_operator_category_relation"
+
+ category_id = Column(String(36), primary_key=True, comment="分类ID")
+ operator_id = Column(String(36), primary_key=True, comment="算子ID")
+
+ __table_args__ = (
+ Index("idx_category_id", "category_id"),
+ Index("idx_operator_id", "operator_id"),
+ )
+
+
+class OperatorRelease(BaseEntity):
+ """算子发布版本实体"""
+ __tablename__ = "t_operator_release"
+
+ id = Column(String(36), primary_key=True, comment="算子ID")
+ version = Column(String(50), primary_key=True, comment="版本号")
+ release_date = Column(TIMESTAMP, nullable=False, default=func.now(), comment="发布时间")
+ changelog = Column(JSON, nullable=True, comment="更新日志列表")
+
+
+# Ignore data scope for operator models
+for model in [Operator, Category, CategoryRelation, OperatorRelease]:
+ model.__ignore_data_scope__ = True
diff --git a/runtime/datamate-python/app/module/__init__.py b/runtime/datamate-python/app/module/__init__.py
index 7d3c482b..edf8f547 100644
--- a/runtime/datamate-python/app/module/__init__.py
+++ b/runtime/datamate-python/app/module/__init__.py
@@ -7,6 +7,9 @@
from .evaluation.interface import router as evaluation_router
from .collection.interface import router as collection_route
from .rag.interface.rag_interface import router as rag_router
+from .operator.interface import operator_router
+from .operator.interface import category_router
+from .cleaning.interface import router as cleaning_router
router = APIRouter(
prefix="/api"
@@ -19,5 +22,8 @@
router.include_router(evaluation_router)
router.include_router(collection_route)
router.include_router(rag_router)
+router.include_router(operator_router)
+router.include_router(category_router)
+router.include_router(cleaning_router)
__all__ = ["router"]
diff --git a/runtime/datamate-python/app/module/cleaning/__init__.py b/runtime/datamate-python/app/module/cleaning/__init__.py
new file mode 100644
index 00000000..7224d83c
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/__init__.py
@@ -0,0 +1,50 @@
+from .schema import (
+ CleaningTaskStatus,
+ OperatorInstanceDto,
+ CleaningProcess,
+ CleaningTaskDto,
+ CreateCleaningTaskRequest,
+ CleaningResultDto,
+ CleaningTaskLog,
+ CleaningTemplateDto,
+ CreateCleaningTemplateRequest,
+ UpdateCleaningTemplateRequest,
+)
+
+from .repository import (
+ CleaningTaskRepository,
+ CleaningTemplateRepository,
+ CleaningResultRepository,
+ OperatorInstanceRepository,
+)
+
+from .service import (
+ CleaningTaskValidator,
+ CleaningTaskScheduler,
+ CleaningTemplateService,
+ CleaningTaskService,
+)
+
+from .runtime_client import RuntimeClient
+
+__all__ = [
+ "CleaningTaskStatus",
+ "OperatorInstanceDto",
+ "CleaningProcess",
+ "CleaningTaskDto",
+ "CreateCleaningTaskRequest",
+ "CleaningResultDto",
+ "CleaningTaskLog",
+ "CleaningTemplateDto",
+ "CreateCleaningTemplateRequest",
+ "UpdateCleaningTemplateRequest",
+ "CleaningTaskRepository",
+ "CleaningTemplateRepository",
+ "CleaningResultRepository",
+ "OperatorInstanceRepository",
+ "CleaningTaskValidator",
+ "CleaningTaskScheduler",
+ "CleaningTemplateService",
+ "CleaningTaskService",
+ "RuntimeClient",
+]
diff --git a/runtime/datamate-python/app/module/cleaning/exceptions.py b/runtime/datamate-python/app/module/cleaning/exceptions.py
new file mode 100644
index 00000000..85c0718f
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/exceptions.py
@@ -0,0 +1,57 @@
+from typing import Optional
+
+
+class CleaningException(Exception):
+ """Base exception for cleaning module"""
+ def __init__(self, message: str, details: Optional[dict] = None):
+ self.message = message
+ self.details = details
+ super().__init__(self.message)
+
+
+class CleaningNameDuplicationError(CleaningException):
+ """Exception raised when cleaning task name is duplicated"""
+ def __init__(self, name: str):
+ super().__init__(f"Cleaning task name '{name}' is duplicated")
+
+
+class CleaningTaskNotFoundError(CleaningException):
+ """Exception raised when cleaning task is not found"""
+ def __init__(self, task_id: str):
+ super().__init__(f"Cleaning task '{task_id}' not found")
+
+
+class CleaningTemplateNotFoundError(CleaningException):
+ """Exception raised when cleaning template is not found"""
+ def __init__(self, template_id: str):
+ super().__init__(f"Cleaning template '{template_id}' not found")
+
+
+class InvalidOperatorInputError(CleaningException):
+ """Exception raised when operator input/output types are invalid"""
+ def __init__(self, message: str = "Invalid operator input/output types"):
+ super().__init__(message)
+
+
+class ExecutorTypeError(CleaningException):
+ """Exception raised when executor type is invalid"""
+ def __init__(self, message: str = "Invalid executor type"):
+ super().__init__(message)
+
+
+class DatasetNotFoundError(CleaningException):
+ """Exception raised when dataset is not found"""
+ def __init__(self, dataset_id: str):
+ super().__init__(f"Dataset '{dataset_id}' not found")
+
+
+class FileSystemError(CleaningException):
+ """Exception raised when file system operations fail"""
+ def __init__(self, message: str, details: Optional[dict] = None):
+ super().__init__(f"File system error: {message}", details)
+
+
+class SettingsParseError(CleaningException):
+ """Exception raised when operator settings parsing fails"""
+ def __init__(self, message: str, details: Optional[dict] = None):
+ super().__init__(f"Settings parse error: {message}", details)
diff --git a/runtime/datamate-python/app/module/cleaning/interface/__init__.py b/runtime/datamate-python/app/module/cleaning/interface/__init__.py
new file mode 100644
index 00000000..a8d5421d
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/interface/__init__.py
@@ -0,0 +1,8 @@
+from fastapi import APIRouter
+
+from .cleaning_task_routes import router as task_router
+from .cleaning_template_routes import router as template_router
+
+router = APIRouter()
+router.include_router(task_router)
+router.include_router(template_router)
diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py
new file mode 100644
index 00000000..1f8cba2b
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py
@@ -0,0 +1,234 @@
+from typing import Optional
+
+from fastapi import APIRouter, Depends
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.logging import get_logger
+from app.db.session import get_db
+from app.module.cleaning.schema import (
+ CleaningTaskDto,
+ CreateCleaningTaskRequest,
+ CleaningResultDto,
+ CleaningTaskLog,
+)
+from app.module.cleaning.service import CleaningTaskService
+from app.module.shared.schema import StandardResponse, PaginatedData
+
+logger = get_logger(__name__)
+
+router = APIRouter(prefix="/cleaning/tasks", tags=["Cleaning Tasks"])
+
+
+def _get_operator_service():
+ """Get operator service"""
+ from app.module.operator.service import OperatorService
+ from app.module.operator.repository import (
+ OperatorRepository,
+ CategoryRelationRepository,
+ OperatorReleaseRepository,
+ )
+ from app.module.operator.parsers import ParserHolder
+ from app.module.shared.file_service import FileService
+ from app.module.shared.chunk_upload_repository import ChunkUploadRepository
+
+ return OperatorService(
+ operator_repo=OperatorRepository(None),
+ category_relation_repo=CategoryRelationRepository(None),
+ operator_release_repo=OperatorReleaseRepository(None),
+ parser_holder=ParserHolder(),
+ file_service=FileService(ChunkUploadRepository()),
+ )
+
+
+def _get_task_service(db: AsyncSession) -> CleaningTaskService:
+ """Get cleaning task service instance"""
+ from app.module.cleaning.service import (
+ CleaningTaskScheduler,
+ CleaningTaskValidator,
+ )
+ from app.module.cleaning.repository import (
+ CleaningTaskRepository,
+ CleaningResultRepository,
+ OperatorInstanceRepository,
+ )
+ from app.module.cleaning.runtime_client import RuntimeClient
+ from app.module.dataset.service import DatasetManagementService
+ from app.module.shared.common.lineage import LineageService
+
+ runtime_client = RuntimeClient()
+ scheduler = CleaningTaskScheduler(
+ task_repo=CleaningTaskRepository(None),
+ runtime_client=runtime_client
+ )
+ operator_service = _get_operator_service()
+ dataset_service = DatasetManagementService(db)
+ lineage_service = LineageService(db)
+
+ task_repo = CleaningTaskRepository(None)
+
+ return CleaningTaskService(
+ task_repo=task_repo,
+ result_repo=CleaningResultRepository(None),
+ operator_instance_repo=OperatorInstanceRepository(None),
+ operator_service=operator_service,
+ scheduler=scheduler,
+ validator=CleaningTaskValidator(task_repo=task_repo, template_repo=None),
+ dataset_service=dataset_service,
+ lineage_service=lineage_service,
+ )
+
+
+@router.get(
+ "",
+ response_model=StandardResponse[PaginatedData[CleaningTaskDto]],
+ summary="查询清洗任务列表",
+ description="根据参数查询清洗任务列表(支持分页、状态过滤、关键词搜索)",
+ tags=['mcp']
+)
+async def get_cleaning_tasks(
+ page: int = 0,
+ size: int = 10,
+ status: Optional[str] = None,
+ keyword: Optional[str] = None,
+ db: AsyncSession = Depends(get_db),
+):
+ """Query cleaning tasks"""
+ task_service = _get_task_service(db)
+
+ tasks = await task_service.get_tasks(db, status, keyword, page, size)
+ count = await task_service.count_tasks(db, status, keyword)
+ total_pages = (count + size - 1) // size if size > 0 else 0
+
+ return StandardResponse(
+ code="0",
+ message="success",
+ data=PaginatedData(
+ page=page,
+ size=size,
+ total_elements=count,
+ total_pages=total_pages,
+ content=tasks,
+ )
+ )
+
+
+@router.post(
+ "",
+ response_model=StandardResponse[CleaningTaskDto],
+ summary="创建清洗任务",
+ description="根据模板ID或算子列表创建清洗任务",
+ tags=['mcp']
+)
+async def create_cleaning_task(
+ request: CreateCleaningTaskRequest,
+ db: AsyncSession = Depends(get_db),
+):
+ """Create cleaning task"""
+ task_service = _get_task_service(db)
+
+ task = await task_service.create_task(db, request)
+ await db.commit()
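+ # the new task is committed first; execution is then triggered and committed separately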
+
+ await task_service.execute_task(db, task.id)
+ await db.commit()
+
+ return StandardResponse(code="0", message="success", data=task)
+
+
+@router.get(
+ "/{task_id}",
+ response_model=StandardResponse[CleaningTaskDto],
+ summary="获取清洗任务详情",
+ description="根据ID获取清洗任务详细信息"
+)
+async def get_cleaning_task(
+ task_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Get cleaning task by ID"""
+ task_service = _get_task_service(db)
+ task = await task_service.get_task(db, task_id)
+ return StandardResponse(code="0", message="success", data=task)
+
+
+@router.delete(
+ "/{task_id}",
+ response_model=StandardResponse[str],
+ summary="删除清洗任务",
+ description="删除指定的清洗任务"
+)
+async def delete_cleaning_task(
+ task_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Delete cleaning task"""
+ task_service = _get_task_service(db)
+ await task_service.delete_task(db, task_id)
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=task_id)
+
+
+@router.post(
+ "/{task_id}/stop",
+ response_model=StandardResponse[str],
+ summary="停止清洗任务",
+ description="停止正在运行的清洗任务"
+)
+async def stop_cleaning_task(
+ task_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Stop cleaning task"""
+ task_service = _get_task_service(db)
+ await task_service.stop_task(db, task_id)
+ return StandardResponse(code="0", message="success", data=task_id)
+
+
+@router.post(
+ "/{task_id}/execute",
+ response_model=StandardResponse[str],
+ summary="执行清洗任务",
+ description="重新执行清洗任务"
+)
+async def execute_cleaning_task(
+ task_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Execute cleaning task"""
+ task_service = _get_task_service(db)
+ await task_service.execute_task(db, task_id)
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=task_id)
+
+
+@router.get(
+ "/{task_id}/result",
+ response_model=StandardResponse[list[CleaningResultDto]],
+ summary="获取清洗任务结果",
+ description="获取指定清洗任务的执行结果"
+)
+async def get_cleaning_task_results(
+ task_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Get cleaning task results"""
+ task_service = _get_task_service(db)
+ results = await task_service.get_task_results(db, task_id)
+ return StandardResponse(code="0", message="success", data=results)
+
+
+@router.get(
+ "/{task_id}/log/{retry_count}",
+ response_model=StandardResponse[list[CleaningTaskLog]],
+ summary="获取清洗任务日志",
+ description="获取指定清洗任务的执行日志"
+)
+async def get_cleaning_task_log(
+ task_id: str,
+ retry_count: int,
+ db: AsyncSession = Depends(get_db),
+):
+ """Get cleaning task log"""
+ task_service = _get_task_service(db)
+ logs = await task_service.get_task_log(db, task_id, retry_count)
+ return StandardResponse(code="0", message="success", data=logs)
diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py
new file mode 100644
index 00000000..102a625e
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py
@@ -0,0 +1,180 @@
+import math
+from typing import Optional
+
+from fastapi import APIRouter, Depends, Query
+from sqlalchemy import select, func
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.logging import get_logger
+from app.db.session import get_db
+from app.module.cleaning.schema import (
+ CleaningTemplateDto,
+ CreateCleaningTemplateRequest,
+ UpdateCleaningTemplateRequest,
+)
+from app.module.cleaning.service import CleaningTemplateService
+from app.module.shared.schema import StandardResponse, PaginatedData
+
+logger = get_logger(__name__)
+
+router = APIRouter(prefix="/cleaning/templates", tags=["Cleaning Templates"])
+
+
+def _get_operator_service():
+ """Get operator service"""
+ from app.module.operator.service import OperatorService
+ from app.module.operator.repository import (
+ OperatorRepository,
+ CategoryRelationRepository,
+ OperatorReleaseRepository,
+ )
+ from app.module.operator.parsers import ParserHolder
+ from app.module.shared.file_service import FileService
+ from app.module.shared.chunk_upload_repository import ChunkUploadRepository
+
+ return OperatorService(
+ operator_repo=OperatorRepository(None),
+ category_relation_repo=CategoryRelationRepository(None),
+ operator_release_repo=OperatorReleaseRepository(None),
+ parser_holder=ParserHolder(),
+ file_service=FileService(ChunkUploadRepository()),
+ )
+
+
+def _get_template_service(db: AsyncSession) -> CleaningTemplateService:
+ """Get cleaning template service instance"""
+ from app.module.cleaning.service import CleaningTaskValidator
+ from app.module.cleaning.repository import (
+ CleaningTemplateRepository,
+ OperatorInstanceRepository,
+ )
+
+ operator_service = _get_operator_service()
+
+ template_repo = CleaningTemplateRepository(None)
+
+ return CleaningTemplateService(
+ template_repo=template_repo,
+ operator_instance_repo=OperatorInstanceRepository(None),
+ operator_service=operator_service,
+ validator=CleaningTaskValidator(task_repo=None, template_repo=template_repo),
+ )
+
+
+@router.get(
+ "",
+ response_model=StandardResponse[PaginatedData[CleaningTemplateDto]],
+ summary="查询清洗模板列表",
+ description="分页查询清洗模板"
+)
+async def get_cleaning_templates(
+ page: int = Query(1, description="页码"),
+ size: int = Query(20, description="每页数量"),
+ keyword: Optional[str] = Query(None, description="关键词搜索"),
+ db: AsyncSession = Depends(get_db),
+):
+ """Query cleaning templates with pagination"""
+ from app.db.models.cleaning import CleaningTemplate
+
+ template_service = _get_template_service(db)
+
+ query = select(CleaningTemplate)
+
+ if keyword:
+ keyword_pattern = f"%{keyword}%"
+ query = query.where(
+ CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern)
+ )
+
+ count_query = select(func.count()).select_from(query.subquery())
+ total = (await db.execute(count_query)).scalar_one()
+
+ items = await template_service.get_templates(db, keyword)
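+ # note: get_templates returns all matching templates; page/size are echoed in the response but not applied to this query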
+
+ total_pages = math.ceil(total / size) if total > 0 else 0
+
+ return StandardResponse(
+ code="0",
+ message="success",
+ data=PaginatedData(
+ content=items,
+ total_elements=total,
+ total_pages=total_pages,
+ page=page,
+ size=size,
+ )
+ )
+
+
+@router.post(
+ "",
+ response_model=StandardResponse[CleaningTemplateDto],
+ summary="创建清洗模板",
+ description="创建新的清洗模板"
+)
+async def create_cleaning_template(
+ request: CreateCleaningTemplateRequest,
+ db: AsyncSession = Depends(get_db),
+):
+ """Create cleaning template"""
+ template_service = _get_template_service(db)
+
+ template = await template_service.create_template(db, request)
+ await db.commit()
+
+ return StandardResponse(code="0", message="success", data=template)
+
+
+@router.get(
+ "/{template_id}",
+ response_model=StandardResponse[CleaningTemplateDto],
+ summary="获取清洗模板详情",
+ description="根据ID获取清洗模板详细信息"
+)
+async def get_cleaning_template(
+ template_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Get cleaning template by ID"""
+ template_service = _get_template_service(db)
+
+ template = await template_service.get_template(db, template_id)
+ return StandardResponse(code="0", message="success", data=template)
+
+
+@router.put(
+ "/{template_id}",
+ response_model=StandardResponse[CleaningTemplateDto],
+ summary="更新清洗模板",
+ description="更新清洗模板信息"
+)
+async def update_cleaning_template(
+ template_id: str,
+ request: UpdateCleaningTemplateRequest,
+ db: AsyncSession = Depends(get_db),
+):
+ """Update cleaning template"""
+ template_service = _get_template_service(db)
+
+ template = await template_service.update_template(db, template_id, request)
+ await db.commit()
+
+ return StandardResponse(code="0", message="success", data=template)
+
+
+@router.delete(
+ "/{template_id}",
+ response_model=StandardResponse[str],
+ summary="删除清洗模板",
+ description="删除指定的清洗模板"
+)
+async def delete_cleaning_template(
+ template_id: str,
+ db: AsyncSession = Depends(get_db),
+):
+ """Delete cleaning template"""
+ template_service = _get_template_service(db)
+ await template_service.delete_template(db, template_id)
+ await db.commit()
+
+ return StandardResponse(code="0", message="success", data=template_id)
diff --git a/runtime/datamate-python/app/module/cleaning/repository/__init__.py b/runtime/datamate-python/app/module/cleaning/repository/__init__.py
new file mode 100644
index 00000000..f8663a94
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/repository/__init__.py
@@ -0,0 +1,11 @@
+from .cleaning_task_repository import CleaningTaskRepository
+from .cleaning_template_repository import CleaningTemplateRepository
+from .cleaning_result_repository import CleaningResultRepository
+from .operator_instance_repository import OperatorInstanceRepository
+
+__all__ = [
+ "CleaningTaskRepository",
+ "CleaningTemplateRepository",
+ "CleaningResultRepository",
+ "OperatorInstanceRepository",
+]
diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py
new file mode 100644
index 00000000..a6aa62e3
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py
@@ -0,0 +1,75 @@
+from typing import List, Optional
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, delete
+from app.db.models.cleaning import CleaningResult
+from app.module.cleaning.schema import CleaningResultDto
+
+
+class CleaningResultRepository:
+ """Repository for cleaning result operations"""
+
+ def __init__(self, model=None):
+ self.model = model if model else CleaningResult
+
+ async def find_by_instance_id(
+ self,
+ db: AsyncSession,
+ instance_id: str,
+ status: Optional[str] = None
+ ) -> List[CleaningResultDto]:
+ """Query results by instance ID"""
+ query = select(self.model).where(self.model.instance_id == instance_id)
+
+ if status:
+ query = query.where(self.model.status == status)
+
+ result = await db.execute(query)
+ results = result.scalars().all()
+
+ return [
+ CleaningResultDto(
+ instance_id=res.instance_id,
+ src_file_id=res.src_file_id,
+ dest_file_id=res.dest_file_id,
+ src_name=res.src_name,
+ dest_name=res.dest_name,
+ src_type=res.src_type,
+ dest_type=res.dest_type,
+ src_size=res.src_size,
+ dest_size=res.dest_size,
+ status=res.status,
+ result=res.result
+ )
+ for res in results
+ ]
+
+ async def count_by_instance_id(
+ self,
+ db: AsyncSession,
+ instance_id: str
+ ) -> tuple[int, int]:
+ """Count results by instance ID (completed, failed)"""
+ base_query = select(self.model).where(self.model.instance_id == instance_id)
+ completed_query = base_query.where(self.model.status == "COMPLETED")
+ failed_query = base_query.where(self.model.status == "FAILED")
+
+ completed = len((await db.execute(completed_query)).scalars().all())
+ failed = len((await db.execute(failed_query)).scalars().all())
+
+ return (completed, failed)
+
+ async def delete_by_instance_id(
+ self,
+ db: AsyncSession,
+ instance_id: str,
+ status: Optional[str] = None
+ ) -> None:
+ """Delete results by instance ID"""
+ query = delete(self.model).where(self.model.instance_id == instance_id)
+
+ if status:
+ query = query.where(self.model.status == status)
+
+ await db.execute(query)
+ await db.flush()
diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py
new file mode 100644
index 00000000..7c83d9a2
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py
@@ -0,0 +1,140 @@
+from typing import List, Optional
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, delete, func
+from app.db.models.cleaning import CleaningTask
+from app.module.cleaning.schema import CleaningTaskDto
+
+
+class CleaningTaskRepository:
+ """Repository for cleaning task operations"""
+
+ def __init__(self, model=None):
+ self.model = model if model else CleaningTask
+
+ async def find_tasks(
+ self,
+ db: AsyncSession,
+ status: Optional[str] = None,
+ keyword: Optional[str] = None,
+ page: Optional[int] = None,
+ size: Optional[int] = None
+ ) -> List[CleaningTaskDto]:
+ """Query cleaning tasks"""
+ query = select(self.model)
+
+ if status:
+ query = query.where(self.model.status == status)
+
+ if keyword:
+ keyword_pattern = f"%{keyword}%"
+ query = query.where(
+ self.model.name.ilike(keyword_pattern) | self.model.description.ilike(keyword_pattern)
+ )
+
+ query = query.order_by(self.model.created_at.desc())
+
+ if page is not None and size is not None:
+ offset = max((page - 1) * size, 0)
+ query = query.offset(offset).limit(size)
+
+ result = await db.execute(query)
+ tasks = result.scalars().all()
+
+ return [
+ CleaningTaskDto(
+ id=task.id,
+ name=task.name,
+ description=task.description,
+ status=task.status,
+ src_dataset_id=task.src_dataset_id,
+ src_dataset_name=task.src_dataset_name,
+ dest_dataset_id=task.dest_dataset_id,
+ dest_dataset_name=task.dest_dataset_name,
+ before_size=task.before_size,
+ after_size=task.after_size,
+ file_count=task.file_count,
+ retry_count=task.retry_count,
+ started_at=task.started_at,
+ finished_at=task.finished_at,
+ created_at=task.created_at
+ )
+ for task in tasks
+ ]
+
+ async def find_task_by_id(self, db: AsyncSession, task_id: str) -> Optional[CleaningTaskDto]:
+ """Query task by ID"""
+ query = select(self.model).where(self.model.id == task_id)
+ result = await db.execute(query)
+ task = result.scalar_one_or_none()
+
+ if not task:
+ return None
+
+ return CleaningTaskDto(
+ id=task.id,
+ name=task.name,
+ description=task.description,
+ status=task.status,
+ src_dataset_id=task.src_dataset_id,
+ src_dataset_name=task.src_dataset_name,
+ dest_dataset_id=task.dest_dataset_id,
+ dest_dataset_name=task.dest_dataset_name,
+ before_size=task.before_size,
+ after_size=task.after_size,
+ file_count=task.file_count,
+ retry_count=task.retry_count,
+ started_at=task.started_at,
+ finished_at=task.finished_at,
+ created_at=task.created_at
+ )
+
+ async def insert_task(self, db: AsyncSession, task: CleaningTaskDto) -> None:
+ """Insert new task"""
+ from app.db.models.cleaning import CleaningTask as CleaningTaskModel
+
+ db_task = CleaningTaskModel(
+ id=task.id,
+ name=task.name,
+ description=task.description,
+ status=task.status,
+ src_dataset_id=task.src_dataset_id,
+ src_dataset_name=task.src_dataset_name,
+ dest_dataset_id=task.dest_dataset_id,
+ dest_dataset_name=task.dest_dataset_name,
+ before_size=task.before_size,
+ after_size=task.after_size,
+ file_count=task.file_count,
+ retry_count=task.retry_count
+ )
+ db.add(db_task)
+ await db.flush()
+
+ async def update_task(self, db: AsyncSession, task: CleaningTaskDto) -> None:
+ """Update task"""
+ query = select(CleaningTask).where(CleaningTask.id == task.id)
+ result = await db.execute(query)
+ db_task = result.scalar_one_or_none()
+
+ if db_task:
+ if task.status:
+ db_task.status = task.status
+ if task.started_at:
+ db_task.started_at = task.started_at
+ if task.finished_at:
+ db_task.finished_at = task.finished_at
+ if task.retry_count is not None:
+ db_task.retry_count = task.retry_count
+
+ await db.flush()
+
+ async def delete_task_by_id(self, db: AsyncSession, task_id: str) -> None:
+ """Delete task by ID"""
+ query = delete(self.model).where(self.model.id == task_id)
+ await db.execute(query)
+ await db.flush()
+
+ async def is_name_exist(self, db: AsyncSession, name: str) -> bool:
+ """Check if task name exists"""
+ query = select(func.count()).select_from(self.model).where(self.model.name == name)
+ result = await db.execute(query)
+ return result.scalar_one() > 0
diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py
new file mode 100644
index 00000000..aa35ba71
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py
@@ -0,0 +1,63 @@
+from typing import List, Optional
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, delete, func
+from app.db.models.cleaning import CleaningTemplate
+
+
+class CleaningTemplateRepository:
+ """Repository for cleaning template operations"""
+
+ def __init__(self, model=None):
+ self.model = model if model else CleaningTemplate
+
+ async def find_all_templates(
+ self,
+ db: AsyncSession,
+ keyword: Optional[str] = None
+ ) -> List[CleaningTemplate]:
+ """Query all templates"""
+ query = select(self.model)
+
+ if keyword:
+ keyword_pattern = f"%{keyword}%"
+ query = query.where(
+ self.model.name.ilike(keyword_pattern) | self.model.description.ilike(keyword_pattern)
+ )
+
+ query = query.order_by(self.model.created_at.desc())
+ result = await db.execute(query)
+ return result.scalars().all()
+
+ async def find_template_by_id(self, db: AsyncSession, template_id: str) -> Optional[CleaningTemplate]:
+ """Query template by ID"""
+ query = select(self.model).where(self.model.id == template_id)
+ result = await db.execute(query)
+ return result.scalar_one_or_none()
+
+ async def insert_template(self, db: AsyncSession, template: CleaningTemplate) -> None:
+ """Insert new template"""
+ db.add(template)
+ await db.flush()
+
+ async def update_template(self, db: AsyncSession, template: CleaningTemplate) -> None:
+ """Update template"""
+ query = select(self.model).where(self.model.id == template.id)
+ result = await db.execute(query)
+ db_template = result.scalar_one_or_none()
+
+ if db_template:
+ db_template.name = template.name
+ db_template.description = template.description
+ await db.flush()
+
+ async def delete_template(self, db: AsyncSession, template_id: str) -> None:
+ """Delete template"""
+ query = delete(self.model).where(self.model.id == template_id)
+ await db.execute(query)
+ await db.flush()
+
+ async def is_name_exist(self, db: AsyncSession, name: str) -> bool:
+ """Check if template name exists"""
+ query = select(func.count()).select_from(self.model).where(self.model.name == name)
+ result = await db.execute(query)
+ return result.scalar_one() > 0
diff --git a/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py b/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py
new file mode 100644
index 00000000..b8a20b1d
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py
@@ -0,0 +1,56 @@
+import json
+from typing import List
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, delete
+from app.db.models.cleaning import OperatorInstance
+
+
+class OperatorInstanceRepository:
+ """Repository for operator instance operations"""
+
+ def __init__(self, model=None):
+ self.model = model if model else OperatorInstance
+
+ async def find_operator_by_instance_id(
+ self,
+ db: AsyncSession,
+ instance_id: str
+ ) -> List[OperatorInstance]:
+ """Query operator instances by instance ID"""
+ query = select(self.model).where(self.model.instance_id == instance_id)
+ query = query.order_by(self.model.op_index.asc())
+ result = await db.execute(query)
+ return result.scalars().all()
+
+ async def find_instance_by_instance_id(
+ self,
+ db: AsyncSession,
+ instance_id: str
+ ) -> List[OperatorInstance]:
+ """Query instances for template (same as find_operator_by_instance_id)"""
+ return await self.find_operator_by_instance_id(db, instance_id)
+
+ async def insert_instance(
+ self,
+ db: AsyncSession,
+ instance_id: str,
+ instances: List
+ ) -> None:
+ """Insert operator instances"""
+ from app.db.models.cleaning import OperatorInstance as OperatorInstanceModel
+
+ for idx, instance in enumerate(instances):
+ db_instance = OperatorInstanceModel(
+ instance_id=instance_id,
+ operator_id=instance.id,
+ op_index=idx,
+ settings_override=json.dumps(instance.overrides),
+ )
+ db.add(db_instance)
+ await db.flush()
+
+ async def delete_by_instance_id(self, db: AsyncSession, instance_id: str) -> None:
+ """Delete instances by instance ID"""
+ query = delete(self.model).where(self.model.instance_id == instance_id)
+ await db.execute(query)
+ await db.flush()
diff --git a/runtime/datamate-python/app/module/cleaning/runtime_client.py b/runtime/datamate-python/app/module/cleaning/runtime_client.py
new file mode 100644
index 00000000..0983256f
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/runtime_client.py
@@ -0,0 +1,61 @@
+import httpx
+from typing import Optional
+from app.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class RuntimeClient:
+ """HTTP client for communicating with runtime service"""
+
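+ # the default base_url assumes the runtime executor is reachable at the "datamate-runtime" service host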
+ def __init__(self, base_url: str = "http://datamate-runtime:8081"):
+ self.base_url = base_url
+ self.client = httpx.AsyncClient(timeout=60.0)
+
+ async def submit_task(self, task_id: str) -> bool:
+ """Submit cleaning task to runtime executor"""
+ try:
+ url = f"{self.base_url}/api/task/{task_id}/submit"
+ response = await self.client.post(url)
+ response.raise_for_status()
+ logger.info(f"Task {task_id} submitted successfully")
+ return True
+ except httpx.HTTPError as e:
+ logger.error(f"Failed to submit task {task_id}: {e}")
+ return False
+ except Exception as e:
+ logger.error(f"Unexpected error submitting task {task_id}: {e}")
+ return False
+
+ async def stop_task(self, task_id: str) -> bool:
+ """Stop running cleaning task"""
+ try:
+ url = f"{self.base_url}/api/task/{task_id}/stop"
+ response = await self.client.post(url)
+ response.raise_for_status()
+ logger.info(f"Task {task_id} stopped successfully")
+ return True
+ except httpx.HTTPError as e:
+ logger.error(f"Failed to stop task {task_id}: {e}")
+ return False
+ except Exception as e:
+ logger.error(f"Unexpected error stopping task {task_id}: {e}")
+ return False
+
+ async def get_task_status(self, task_id: str) -> Optional[dict]:
+ """Get task status from runtime"""
+ try:
+ url = f"{self.base_url}/api/task/{task_id}/status"
+ response = await self.client.get(url)
+ response.raise_for_status()
+ return response.json()
+ except httpx.HTTPError as e:
+ logger.error(f"Failed to get task status {task_id}: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Unexpected error getting task status {task_id}: {e}")
+ return None
+
+ async def close(self):
+ """Close HTTP client"""
+ await self.client.aclose()
diff --git a/runtime/datamate-python/app/module/cleaning/schema/__init__.py b/runtime/datamate-python/app/module/cleaning/schema/__init__.py
new file mode 100644
index 00000000..6a38375b
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/schema/__init__.py
@@ -0,0 +1,25 @@
+from .cleaning import (
+ CleaningTaskStatus,
+ OperatorInstanceDto,
+ CleaningProcess,
+ CleaningTaskDto,
+ CreateCleaningTaskRequest,
+ CleaningResultDto,
+ CleaningTaskLog,
+ CleaningTemplateDto,
+ CreateCleaningTemplateRequest,
+ UpdateCleaningTemplateRequest,
+)
+
+__all__ = [
+ "CleaningTaskStatus",
+ "OperatorInstanceDto",
+ "CleaningProcess",
+ "CleaningTaskDto",
+ "CreateCleaningTaskRequest",
+ "CleaningResultDto",
+ "CleaningTaskLog",
+ "CleaningTemplateDto",
+ "CreateCleaningTemplateRequest",
+ "UpdateCleaningTemplateRequest",
+]
diff --git a/runtime/datamate-python/app/module/cleaning/schema/cleaning.py b/runtime/datamate-python/app/module/cleaning/schema/cleaning.py
new file mode 100644
index 00000000..0571b29c
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/schema/cleaning.py
@@ -0,0 +1,138 @@
+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel, Field
+from datetime import datetime
+from app.module.shared.schema.common import BaseResponseModel
+
+
+class CleaningTaskStatus:
+ PENDING = "PENDING"
+ RUNNING = "RUNNING"
+ COMPLETED = "COMPLETED"
+ STOPPED = "STOPPED"
+ FAILED = "FAILED"
+
+
+class OperatorInstanceDto(BaseResponseModel):
+ """Operator instance DTO for task or template"""
+ id: str = Field(..., description="Operator ID")
+ name: Optional[str] = Field(None, description="Operator name")
+ description: Optional[str] = Field(None, description="Operator description")
+ inputs: Optional[str] = Field(None, description="Input types: text/image/audio/video/multimodal")
+ outputs: Optional[str] = Field(None, description="Output types: text/image/audio/video/multimodal")
+ categories: Optional[List[str]] = Field(None, description="Category IDs")
+ settings: Optional[str] = Field(None, description="Operator settings (JSON)")
+ overrides: Dict[str, Any] = Field(default_factory=dict, description="Operator parameter overrides")
+
+
+class CleaningProcess(BaseResponseModel):
+ """Task progress information (matches Java version)"""
+ process: float = Field(..., description="Progress percentage")
+ successRate: float = Field(..., description="Success rate percentage")
+ totalFileNum: int = Field(..., description="Total file count")
+ succeedFileNum: int = Field(..., description="Succeeded file count")
+ failedFileNum: int = Field(..., description="Failed file count")
+ finishedFileNum: int = Field(..., description="Finished file count")
+
+ @classmethod
+ def of(cls, total: int, succeed: int, failed: int) -> 'CleaningProcess':
+ """Create progress info (matches Java version logic)"""
+ finished_file_num = succeed + failed
+
+ if total == 0:
+ process = 0.0
+ else:
+ process = round(finished_file_num * 100.0 / total, 2)
+
+ if finished_file_num == 0:
+ success_rate = 0.0
+ else:
+ success_rate = round(succeed * 100.0 / finished_file_num, 2)
+
+ return cls(
+ process=process,
+ successRate=success_rate,
+ totalFileNum=total,
+ succeedFileNum=succeed,
+ failedFileNum=failed,
+ finishedFileNum=finished_file_num,
+ )
+
+
+class CleaningTaskDto(BaseResponseModel):
+ """Cleaning task DTO"""
+ id: Optional[str] = Field(None, description="Task ID")
+ name: Optional[str] = Field(None, description="Task name")
+ description: Optional[str] = Field(None, description="Task description")
+ src_dataset_id: Optional[str] = Field(None, description="Source dataset ID")
+ src_dataset_name: Optional[str] = Field(None, description="Source dataset name")
+ dest_dataset_id: Optional[str] = Field(None, description="Destination dataset ID")
+ dest_dataset_name: Optional[str] = Field(None, description="Destination dataset name")
+ before_size: Optional[int] = Field(None, description="Data size before cleaning")
+ after_size: Optional[int] = Field(None, description="Data size after cleaning")
+ file_count: Optional[int] = Field(None, description="Total file count")
+ retry_count: Optional[int] = Field(None, description="Retry count")
+ status: Optional[str] = Field(None, description="Task status")
+ template_id: Optional[str] = Field(None, description="Template ID if created from template")
+ instance: Optional[List[OperatorInstanceDto]] = Field(None, description="Operator instances")
+ progress: Optional[CleaningProcess] = Field(None, description="Task progress")
+ created_at: Optional[datetime] = Field(None, description="Creation time")
+ started_at: Optional[datetime] = Field(None, description="Start time")
+ finished_at: Optional[datetime] = Field(None, description="Finish time")
+
+
+class CreateCleaningTaskRequest(BaseResponseModel):
+ """Request to create cleaning task"""
+ name: str = Field(..., description="Cleaning task name")
+ description: str = Field(..., description="Cleaning task description")
+ src_dataset_id: str = Field(..., description="Source dataset ID")
+ src_dataset_name: str = Field(..., description="Source dataset name")
+ dest_dataset_id: Optional[str] = Field(None, description="Destination dataset ID")
+ dest_dataset_name: str = Field(..., description="Destination dataset name, creates new dataset if destDatasetId is empty")
+ dest_dataset_type: str = Field(..., description="Destination dataset type: TEXT/IMAGE/VIDEO/AUDIO/OTHER")
+ template_id: Optional[str] = Field(None, description="Template ID (alternative to instance)")
+ instance: List[OperatorInstanceDto] = Field(default_factory=list, description="Operator list (alternative to templateId)")
+
+
+class CleaningResultDto(BaseResponseModel):
+ """Cleaning result DTO"""
+ instance_id: Optional[str] = Field(None, description="Instance ID")
+ src_file_id: Optional[str] = Field(None, description="Source file ID")
+ dest_file_id: Optional[str] = Field(None, description="Destination file ID")
+ src_name: Optional[str] = Field(None, description="Source file name")
+ dest_name: Optional[str] = Field(None, description="Destination file name")
+ src_type: Optional[str] = Field(None, description="Source file type")
+ dest_type: Optional[str] = Field(None, description="Destination file type")
+ src_size: Optional[int] = Field(None, description="Source file size")
+ dest_size: Optional[int] = Field(None, description="Destination file size")
+ status: Optional[str] = Field(None, description="Cleaning status")
+ result: Optional[str] = Field(None, description="Cleaning result message")
+
+
+class CleaningTaskLog(BaseResponseModel):
+ """Task log entry"""
+ level: str = Field(..., description="Log level: INFO, WARN, ERROR")
+ message: str = Field(..., description="Log message")
+
+
+class CleaningTemplateDto(BaseResponseModel):
+ """Cleaning template DTO"""
+ id: Optional[str] = Field(None, description="Template ID")
+ name: Optional[str] = Field(None, description="Template name")
+ description: Optional[str] = Field(None, description="Template description")
+ instance: List[OperatorInstanceDto] = Field(default_factory=list, description="Operator instances")
+ created_at: Optional[datetime] = Field(None, description="Creation time")
+ updated_at: Optional[datetime] = Field(None, description="Update time")
+
+
+class CreateCleaningTemplateRequest(BaseResponseModel):
+ """Request to create cleaning template"""
+ name: str = Field(..., description="Template name")
+ description: str = Field(..., description="Template description")
+ instance: List[OperatorInstanceDto] = Field(..., description="Operator instances")
+
+
+class UpdateCleaningTemplateRequest(BaseResponseModel):
+ """Request to update cleaning template"""
+ name: str = Field(..., description="Template name")
+ description: str = Field(..., description="Template description")
+ instance: List[OperatorInstanceDto] = Field(..., description="Operator instances")
diff --git a/runtime/datamate-python/app/module/cleaning/service/__init__.py b/runtime/datamate-python/app/module/cleaning/service/__init__.py
new file mode 100644
index 00000000..ed305edf
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/service/__init__.py
@@ -0,0 +1,11 @@
+from .cleaning_task_validator import CleaningTaskValidator
+from .cleaning_task_scheduler import CleaningTaskScheduler
+from .cleaning_template_service import CleaningTemplateService
+from .cleaning_task_service import CleaningTaskService
+
+__all__ = [
+ "CleaningTaskValidator",
+ "CleaningTaskScheduler",
+ "CleaningTemplateService",
+ "CleaningTaskService",
+]
diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py
new file mode 100644
index 00000000..cd1d1321
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py
@@ -0,0 +1,41 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+from app.core.logging import get_logger
+from app.module.cleaning.repository import CleaningTaskRepository
+from app.module.cleaning.runtime_client import RuntimeClient
+
+logger = get_logger(__name__)
+
+
+class CleaningTaskScheduler:
+ """Scheduler for executing cleaning tasks"""
+
+ def __init__(self, task_repo: CleaningTaskRepository, runtime_client: RuntimeClient):
+ self.task_repo = task_repo
+ self.runtime_client = runtime_client
+
+ async def execute_task(self, db: AsyncSession, task_id: str, retry_count: int) -> bool:
+ """Execute cleaning task"""
+ from app.module.cleaning.schema import CleaningTaskDto, CleaningTaskStatus
+ from datetime import datetime
+
+ task = CleaningTaskDto()
+ task.id = task_id
+ task.status = CleaningTaskStatus.RUNNING
+ task.started_at = datetime.now()
+ task.retry_count = retry_count
+
+ await self.task_repo.update_task(db, task)
+ return await self.runtime_client.submit_task(task_id)
+
+ async def stop_task(self, db: AsyncSession, task_id: str) -> bool:
+ """Stop cleaning task"""
+ from app.module.cleaning.schema import CleaningTaskDto, CleaningTaskStatus
+
+ await self.runtime_client.stop_task(task_id)
+
+ task = CleaningTaskDto()
+ task.id = task_id
+ task.status = CleaningTaskStatus.STOPPED
+
+ await self.task_repo.update_task(db, task)
+ return True
diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py
new file mode 100644
index 00000000..9886a18d
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py
@@ -0,0 +1,446 @@
+import json
+import re
+import shutil
+import uuid
+from pathlib import Path
+from typing import List, Dict, Any, Set
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.logging import get_logger
+from app.db.models.base_entity import LineageNode, LineageEdge
+from app.core.exception import BusinessError, ErrorCodes
+from app.module.cleaning.repository import (
+ CleaningTaskRepository,
+ CleaningResultRepository,
+ OperatorInstanceRepository,
+)
+from app.module.cleaning.schema import (
+ CleaningTaskDto,
+ CreateCleaningTaskRequest,
+ CleaningResultDto,
+ CleaningTaskLog,
+ OperatorInstanceDto,
+ CleaningProcess,
+ CleaningTaskStatus,
+)
+from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator
+from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler
+from app.module.shared.common.lineage import LineageService
+from app.module.shared.schema.lineage import NodeType, EdgeType
+
+logger = get_logger(__name__)
+
+DATASET_PATH = "/dataset"
+FLOW_PATH = "/flow"
+
+
+class CleaningTaskService:
+ """Service for managing cleaning tasks"""
+
+ def __init__(
+ self,
+ task_repo: CleaningTaskRepository,
+ result_repo: CleaningResultRepository,
+ operator_instance_repo: OperatorInstanceRepository,
+ operator_service,
+ scheduler: CleaningTaskScheduler,
+ validator: CleaningTaskValidator,
+ dataset_service,
+ lineage_service: LineageService,
+ ):
+ self.task_repo = task_repo
+ self.result_repo = result_repo
+ self.operator_instance_repo = operator_instance_repo
+ self.operator_service = operator_service
+ self.scheduler = scheduler
+ self.validator = validator
+ self.dataset_service = dataset_service
+ self.lineage_service = lineage_service
+
+ async def get_tasks(
+ self,
+ db: AsyncSession,
+ status: str | None = None,
+ keyword: str | None = None,
+ page: int | None = None,
+ size: int | None = None,
+ ) -> List[CleaningTaskDto]:
+ """Get cleaning tasks"""
+ tasks = await self.task_repo.find_tasks(db, status, keyword, page, size)
+
+ for task in tasks:
+ await self._set_process(db, task)
+
+ return tasks
+
+ async def _set_process(self, db: AsyncSession, task: CleaningTaskDto) -> None:
+ """Set task progress"""
+ completed, failed = await self.result_repo.count_by_instance_id(db, task.id)
+ task.progress = CleaningProcess.of(task.file_count or 0, completed, failed)
+
+ async def count_tasks(
+ self,
+ db: AsyncSession,
+ status: str | None = None,
+ keyword: str | None = None,
+ ) -> int:
+ """Count cleaning tasks"""
+ tasks = await self.task_repo.find_tasks(db, status, keyword, None, None)
+ return len(tasks)
+
+ async def get_task(self, db: AsyncSession, task_id: str) -> CleaningTaskDto:
+ """Get task by ID"""
+ task = await self.task_repo.find_task_by_id(db, task_id)
+ if not task:
+ raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id)
+
+ await self._set_process(db, task)
+
+ instances = await self.operator_instance_repo.find_operator_by_instance_id(db, task_id)
+
+ # Batch query operators
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ task.instance = []
+ for inst in instances:
+ operator = operator_map.get(inst.operator_id)
+ if operator:
+ task.instance.append(OperatorInstanceDto(
+ id=operator.id,
+ name=operator.name,
+ description=operator.description,
+ inputs=operator.inputs,
+ outputs=operator.outputs,
+ settings=operator.settings,
+ categories=operator.categories,
+ ))
+ else:
+ task.instance.append(OperatorInstanceDto(id=inst.operator_id))
+
+ return task
+
+ async def create_task(
+ self,
+ db: AsyncSession,
+ request: CreateCleaningTaskRequest
+ ) -> CleaningTaskDto:
+ """Create new cleaning task"""
+        if request.template_id and not request.instance:
+ instances = await self.get_instance_by_template_id(db, request.template_id)
+ request.instance = instances
+
+ await self.validator.check_task_name_duplication(db, request.name)
+ self.validator.check_input_and_output(request.instance)
+ executor_type = self.validator.check_and_get_executor_type(request.instance)
+
+ task_id = str(uuid.uuid4())
+
+ dest_dataset_id = request.dest_dataset_id
+ dest_dataset_name = request.dest_dataset_name
+
+ if not dest_dataset_id:
+ logger.info(f"Creating new dataset: {dest_dataset_name}, type: {request.dest_dataset_type}")
+ dest_dataset_response = await self.dataset_service.create_dataset(
+ name=dest_dataset_name,
+ dataset_type=request.dest_dataset_type,
+ description="",
+ status="ACTIVE"
+ )
+ dest_dataset_id = dest_dataset_response.id
+ logger.info(f"Successfully created dataset: {dest_dataset_id}")
+ else:
+ logger.info(f"Using existing dataset: {dest_dataset_id}")
+ dest_dataset_response = await self.dataset_service.get_dataset(dest_dataset_id)
+
+ src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id)
+ if not src_dataset:
+ raise BusinessError(ErrorCodes.CLEANING_DATASET_NOT_FOUND, request.src_dataset_id)
+
+ task_dto = CleaningTaskDto(
+ id=task_id,
+ name=request.name,
+ description=request.description,
+ status=CleaningTaskStatus.PENDING,
+ src_dataset_id=request.src_dataset_id,
+ src_dataset_name=request.src_dataset_name,
+ dest_dataset_id=dest_dataset_id,
+ dest_dataset_name=dest_dataset_name,
+ before_size=src_dataset.totalSize,
+ file_count=src_dataset.fileCount,
+ retry_count=-1,
+ )
+
+ await self.task_repo.insert_task(db, task_dto)
+
+ await self._add_cleaning_to_graph(src_dataset, task_dto, dest_dataset_response)
+
+ await self.operator_instance_repo.insert_instance(db, task_id, request.instance)
+
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ await self.prepare_task(dest_dataset_id, task_id, request.instance, operator_map, executor_type)
+
+ return await self.get_task(db, task_id)
+
+ async def _add_cleaning_to_graph(
+ self,
+ src_dataset,
+ task: CleaningTaskDto,
+ dest_dataset,
+ ) -> None:
+ """
+ 添加清洗任务到血缘图
+ """
+ from_node = LineageNode(
+ id=src_dataset.id,
+ node_type=NodeType.DATASET.value,
+ name=src_dataset.name,
+ description=src_dataset.description or "",
+ )
+
+ to_node = LineageNode(
+ id=dest_dataset.id,
+ node_type=NodeType.DATASET.value,
+ name=dest_dataset.name,
+ description=dest_dataset.description or "",
+ )
+
+ edge = LineageEdge(
+ process_id=task.id,
+ name=task.name or "",
+ description=task.description or "",
+ edge_type=EdgeType.DATA_CLEANING.value,
+ from_node_id=from_node.id,
+ to_node_id=to_node.id,
+ )
+
+ await self.lineage_service.generate_graph(from_node, edge, to_node)
+
+ async def prepare_task(
+ self,
+ dataset_id: str,
+ task_id: str,
+ instances: List[OperatorInstanceDto],
+ operator_map: dict,
+ executor_type: str,
+ ) -> None:
+ """Prepare task configuration file"""
+ process_config = {
+ "dataset_id": dataset_id,
+ "instance_id": task_id,
+ "dataset_path": f"{FLOW_PATH}/{task_id}/dataset.jsonl",
+ "export_path": f"{DATASET_PATH}/{dataset_id}",
+ "executor_type": executor_type,
+ "process": [],
+ }
+
+ for instance in instances:
+ operator = operator_map.get(instance.id)
+ if not operator:
+ continue
+
+ operator_config = self._get_default_values(operator)
+            operator_config.update(instance.overrides or {})
+
+ runtime_config = self._get_runtime_config(operator)
+ operator_config.update(runtime_config)
+
+ process_config["process"].append({instance.id: operator_config})
+
+ config_file_path = Path(f"{FLOW_PATH}/{task_id}/process.yaml")
+ config_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+ import yaml
+ try:
+ with open(config_file_path, 'w', encoding='utf-8') as f:
+ yaml.dump(process_config, f, default_flow_style=False, allow_unicode=True)
+ except Exception as e:
+ logger.error(f"Failed to write process.yaml: {e}")
+ raise BusinessError(ErrorCodes.CLEANING_FILE_SYSTEM_ERROR, str(e))
+
+ def _get_default_values(self, operator) -> Dict[str, Any]:
+ """Get default values from operator settings"""
+ if not operator.settings:
+ return {}
+
+ try:
+ settings = json.loads(operator.settings)
+ defaults = {}
+
+            for key, value in settings.items():
+                if "defaultVal" in value:
+                    defaults[key] = value["defaultVal"]
+
+ return defaults
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse settings: {e}")
+ return {}
+
+ def _get_runtime_config(self, operator) -> Dict[str, Any]:
+ """Get runtime configuration from operator"""
+ if not operator.runtime:
+ return {}
+
+ try:
+ return json.loads(operator.runtime)
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse runtime config: {e}")
+ return {}
+
+ async def scan_dataset(
+ self,
+ db: AsyncSession,
+ task_id: str,
+ src_dataset_id: str,
+ succeed_files: Set[str] | None = None,
+ ) -> None:
+ """Scan source dataset and create dataset.jsonl"""
+ target_file_path = Path(f"{FLOW_PATH}/{task_id}/dataset.jsonl")
+ target_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+ query = text("""
+ SELECT id, file_name, file_path, file_type, file_size
+ FROM t_dm_dataset_files
+ WHERE dataset_id = :dataset_id
+ ORDER BY created_at
+ """)
+
+ result = await db.execute(query, {"dataset_id": src_dataset_id})
+ files = result.fetchall()
+
+ with open(target_file_path, 'w', encoding='utf-8') as f:
+ for file in files:
+ if succeed_files and file.id in succeed_files:
+ continue
+
+ file_info = {
+ "fileId": file.id,
+ "fileName": file.file_name,
+ "filePath": file.file_path,
+ "fileType": file.file_type,
+ "fileSize": file.file_size,
+ }
+ f.write(json.dumps(file_info, ensure_ascii=False) + "\n")
+
+ async def get_task_results(self, db: AsyncSession, task_id: str) -> List[CleaningResultDto]:
+ """Get task results"""
+ return await self.result_repo.find_by_instance_id(db, task_id)
+
+ async def get_task_log(self, db: AsyncSession, task_id: str, retry_count: int) -> List[CleaningTaskLog]:
+ """Get task log"""
+ self.validator.check_task_id(task_id)
+
+ log_path = Path(f"{FLOW_PATH}/{task_id}/output.log")
+ if retry_count > 0:
+ log_path = Path(f"{FLOW_PATH}/{task_id}/output.log.{retry_count}")
+
+ if not log_path.exists():
+ return []
+
+ logs = []
+ last_level = "INFO"
+
+ standard_level_pattern = re.compile(
+ r"\b(DEBUG|Debug|INFO|Info|WARN|Warn|WARNING|Warning|ERROR|Error|FATAL|Fatal)\b"
+ )
+ exception_suffix_pattern = re.compile(r"\b\w+(Warning|Error|Exception)\b")
+
+ with open(log_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ last_level = self._get_log_level(line, last_level, standard_level_pattern, exception_suffix_pattern)
+ logs.append(CleaningTaskLog(level=last_level, message=line.rstrip()))
+
+ return logs
+
+ def _get_log_level(self, line: str, default_level: str, std_pattern, ex_pattern) -> str:
+ """Extract log level from log line"""
+ if not line or not line.strip():
+ return default_level
+
+ std_match = std_pattern.search(line)
+ if std_match:
+ return std_match.group(1).upper()
+
+ ex_match = ex_pattern.search(line)
+ if ex_match:
+ match = ex_match.group(1).upper()
+ if match == "WARNING":
+ return "WARN"
+ if match in ["ERROR", "EXCEPTION"]:
+ return "ERROR"
+
+ return default_level
+
+ async def delete_task(self, db: AsyncSession, task_id: str) -> None:
+ """Delete task"""
+ self.validator.check_task_id(task_id)
+
+ await self.task_repo.delete_task_by_id(db, task_id)
+ await self.operator_instance_repo.delete_by_instance_id(db, task_id)
+ await self.result_repo.delete_by_instance_id(db, task_id)
+
+ task_path = Path(f"{FLOW_PATH}/{task_id}")
+ if task_path.exists():
+ try:
+ shutil.rmtree(task_path)
+ except Exception as e:
+ logger.warning(f"Failed to delete task path {task_id}: {e}")
+
+ async def execute_task(self, db: AsyncSession, task_id: str) -> bool:
+ """Execute task"""
+ succeeded = await self.result_repo.find_by_instance_id(db, task_id, "COMPLETED")
+ succeed_set = {res.src_file_id for res in succeeded}
+
+ task = await self.task_repo.find_task_by_id(db, task_id)
+ if not task:
+ raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id)
+
+ await self.scan_dataset(db, task_id, task.src_dataset_id, succeed_set)
+ await self.result_repo.delete_by_instance_id(db, task_id, "FAILED")
+
+ return await self.scheduler.execute_task(db, task_id, (task.retry_count or 0) + 1)
+
+ async def stop_task(self, db: AsyncSession, task_id: str) -> bool:
+ """Stop task"""
+ return await self.scheduler.stop_task(db, task_id)
+
+ async def get_instance_by_template_id(
+ self,
+ db: AsyncSession,
+ template_id: str
+ ) -> List[OperatorInstanceDto]:
+ """Get instances by template ID (delegated to template service)"""
+ instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id)
+
+ # Batch query operators
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ result = []
+ for inst in instances:
+ operator = operator_map.get(inst.operator_id)
+ if operator:
+ operator_dto = OperatorInstanceDto(
+ id=operator.id,
+ name=operator.name,
+ description=operator.description,
+ inputs=operator.inputs,
+ outputs=operator.outputs,
+ settings=operator.settings,
+ categories=operator.categories,
+ )
+ if inst.settings_override:
+ try:
+ operator_dto.overrides = json.loads(inst.settings_override)
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse settings for {inst.operator_id}: {e}")
+ result.append(operator_dto)
+
+ return result
diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py
new file mode 100644
index 00000000..32cc47db
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py
@@ -0,0 +1,89 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.exception import BusinessError, ErrorCodes
+from app.module.cleaning.schema import OperatorInstanceDto
+from app.module.operator.constants import CATEGORY_DATA_JUICER_ID, CATEGORY_DATAMATE_ID
+
+
+class CleaningTaskValidator:
+ """Validator for cleaning tasks and templates"""
+
+ def __init__(self, task_repo=None, template_repo=None):
+ self.task_repo = task_repo
+ self.template_repo = template_repo
+
+ async def check_task_name_duplication(self, db: AsyncSession, name: str) -> None:
+ """Check if task name is duplicated"""
+ if not name:
+ raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED)
+ if await self.task_repo.is_name_exist(db, name):
+ raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED)
+
+ async def check_template_name_duplication(self, db: AsyncSession, name: str) -> None:
+ """Check if template name is duplicated"""
+ if not name:
+ raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED)
+ if await self.template_repo.is_name_exist(db, name):
+ raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED)
+
+ @staticmethod
+ def check_input_and_output(instances: list[OperatorInstanceDto]) -> None:
+ """Validate that operator input/output types are compatible"""
+ if not instances:
+ return
+
+ for i in range(len(instances) - 1):
+ current = instances[i]
+ next_op = instances[i + 1]
+
+ if not current.outputs:
+ raise BusinessError(
+ ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT,
+ f"Operator {current.id} has no outputs defined"
+ )
+
+ if not next_op.inputs:
+ raise BusinessError(
+ ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT,
+ f"Operator {next_op.id} has no inputs defined"
+ )
+
+            current_outputs = {s.strip() for s in current.outputs.split(",")}
+            next_inputs = {s.strip() for s in next_op.inputs.split(",")}
+
+ if not current_outputs.intersection(next_inputs):
+ raise BusinessError(
+ ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT,
+ f"Operator {current.id} outputs {current.outputs} "
+ f"but operator {next_op.id} requires {next_op.inputs}"
+ )
+
+ @staticmethod
+ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str:
+ """Check operator categories and determine executor type (datamate/datajuicer)"""
+ if not instances:
+ return "datamate"
+
+ executor_types = set()
+
+ for instance in instances:
+ if instance.categories:
+ for category in instance.categories:
+ if CATEGORY_DATA_JUICER_ID in category.lower():
+ executor_types.add("default")
+ elif CATEGORY_DATAMATE_ID in category.lower():
+ executor_types.add("datamate")
+
+ if len(executor_types) > 1:
+ raise BusinessError(
+ ErrorCodes.CLEANING_INVALID_EXECUTOR_TYPE,
+ "Cannot mix DataMate and DataJuicer operators in same task"
+ )
+
+ return executor_types.pop() if executor_types else "datamate"
+
+ @staticmethod
+ def check_task_id(task_id: str) -> None:
+ """Validate task ID"""
+ if not task_id:
+ raise BusinessError(ErrorCodes.CLEANING_TASK_ID_REQUIRED)
diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py
new file mode 100644
index 00000000..2443bf4f
--- /dev/null
+++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py
@@ -0,0 +1,226 @@
+import json
+import uuid
+from typing import List
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.exception import BusinessError, ErrorCodes
+from app.core.logging import get_logger
+from app.module.cleaning.repository import (
+    CleaningTemplateRepository,
+    OperatorInstanceRepository,
+)
+from app.module.cleaning.schema import (
+    CleaningTemplateDto,
+    CreateCleaningTemplateRequest,
+    UpdateCleaningTemplateRequest,
+    OperatorInstanceDto,
+)
+from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator
+
+logger = get_logger(__name__)
+
+
+class CleaningTemplateService:
+ """Service for managing cleaning templates"""
+
+ def __init__(
+ self,
+ template_repo: CleaningTemplateRepository,
+ operator_instance_repo: OperatorInstanceRepository,
+ operator_service,
+ validator: CleaningTaskValidator,
+ ):
+ self.template_repo = template_repo
+ self.operator_instance_repo = operator_instance_repo
+ self.operator_service = operator_service
+ self.validator = validator
+
+ async def get_templates(
+ self,
+ db: AsyncSession,
+ keyword: str | None = None
+ ) -> List[CleaningTemplateDto]:
+ """Get all templates"""
+ templates = await self.template_repo.find_all_templates(db, keyword)
+
+ # Collect all operator IDs
+ template_instances_map = {}
+ for template in templates:
+ instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template.id)
+ template_instances_map[template.id] = instances
+
+ # Batch query all operators
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ # Build result
+ result = []
+ for template in templates:
+ template_dto = CleaningTemplateDto(
+ id=template.id,
+ name=template.name,
+ description=template.description,
+ instance=[],
+ created_at=template.created_at,
+ updated_at=template.updated_at,
+ )
+
+ instances = template_instances_map.get(template.id, [])
+ for inst in instances:
+ operator = operator_map.get(inst.operator_id)
+ if operator:
+ operator_dto = OperatorInstanceDto(
+ id=operator.id,
+ name=operator.name,
+ description=operator.description,
+ inputs=operator.inputs,
+ outputs=operator.outputs,
+ settings=operator.settings,
+ categories=operator.categories,
+ )
+ if inst.settings_override:
+ try:
+ operator_dto.overrides = json.loads(inst.settings_override)
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse settings for {inst.operator_id}: {e}")
+ template_dto.instance.append(operator_dto)
+
+ result.append(template_dto)
+
+ return result
+
+ async def get_template(
+ self,
+ db: AsyncSession,
+ template_id: str
+ ) -> CleaningTemplateDto:
+ """Get template by ID"""
+ template = await self.template_repo.find_template_by_id(db, template_id)
+ if not template:
+ raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id)
+
+ template_dto = CleaningTemplateDto(
+ id=template.id,
+ name=template.name,
+ description=template.description,
+ instance=[],
+ created_at=template.created_at,
+ updated_at=template.updated_at,
+ )
+
+ instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id)
+
+ # Batch query operators
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ for inst in instances:
+ operator = operator_map.get(inst.operator_id)
+ if operator:
+ operator_dto = OperatorInstanceDto(
+ id=operator.id,
+ name=operator.name,
+ description=operator.description,
+ inputs=operator.inputs,
+ outputs=operator.outputs,
+ settings=operator.settings,
+ categories=operator.categories,
+ )
+ if inst.settings_override:
+ try:
+ operator_dto.overrides = json.loads(inst.settings_override)
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse settings for {inst.operator_id}: {e}")
+ template_dto.instance.append(operator_dto)
+
+ return template_dto
+
+ async def create_template(
+ self,
+ db: AsyncSession,
+ request: CreateCleaningTemplateRequest
+ ) -> CleaningTemplateDto:
+ """Create new template"""
+ from app.db.models.cleaning import CleaningTemplate
+
+ await self.validator.check_template_name_duplication(db, request.name)
+ self.validator.check_input_and_output(request.instance)
+ self.validator.check_and_get_executor_type(request.instance)
+
+ template_id = str(uuid.uuid4())
+ template = CleaningTemplate(
+ id=template_id,
+ name=request.name,
+ description=request.description,
+ )
+
+ await self.template_repo.insert_template(db, template)
+
+ await self.operator_instance_repo.insert_instance(db, template_id, request.instance)
+
+ return await self.get_template(db, template_id)
+
+ async def update_template(
+ self,
+ db: AsyncSession,
+ template_id: str,
+ request: UpdateCleaningTemplateRequest
+ ) -> CleaningTemplateDto:
+ """Update template"""
+
+ template = await self.template_repo.find_template_by_id(db, template_id)
+ if not template:
+ raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id)
+
+ template.name = request.name
+ template.description = request.description
+
+ await self.template_repo.update_template(db, template)
+ await self.operator_instance_repo.delete_by_instance_id(db, template_id)
+
+ await self.operator_instance_repo.insert_instance(db, template_id, request.instance)
+
+ return await self.get_template(db, template_id)
+
+ async def delete_template(self, db: AsyncSession, template_id: str) -> None:
+ """Delete template"""
+ await self.template_repo.delete_template(db, template_id)
+ await self.operator_instance_repo.delete_by_instance_id(db, template_id)
+
+ async def get_instance_by_template_id(
+ self,
+ db: AsyncSession,
+ template_id: str
+ ) -> List[OperatorInstanceDto]:
+ """Get operator instances by template ID"""
+ instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id)
+
+ # Batch query operators
+        all_operators = await self.operator_service.get_operators(
+            db=db, page=0, size=1000, categories=[], keyword=None, is_star=None
+        )
+ operator_map = {op.id: op for op in all_operators}
+
+ result = []
+ for inst in instances:
+ operator = operator_map.get(inst.operator_id)
+ if operator:
+ operator_dto = OperatorInstanceDto(
+ id=operator.id,
+ name=operator.name,
+ description=operator.description,
+ inputs=operator.inputs,
+ outputs=operator.outputs,
+ settings=operator.settings,
+ categories=operator.categories,
+ )
+ if inst.settings_override:
+ try:
+ operator_dto.overrides = json.loads(inst.settings_override)
+ except json.JSONDecodeError as e:
+ logger.error(f"Failed to parse settings for {inst.operator_id}: {e}")
+ result.append(operator_dto)
+
+ return result
diff --git a/runtime/datamate-python/app/module/dataset/schema/__init__.py b/runtime/datamate-python/app/module/dataset/schema/__init__.py
index 221c43f8..6a8b0bd0 100644
--- a/runtime/datamate-python/app/module/dataset/schema/__init__.py
+++ b/runtime/datamate-python/app/module/dataset/schema/__init__.py
@@ -10,6 +10,7 @@
from .dataset import (
DatasetResponse,
DatasetTypeResponse,
+ CreateDatasetRequest,
)
__all__ = [
@@ -21,4 +22,5 @@
"BatchUpdateFileTagsResponse",
"FileTagUpdateResult",
"FileTagUpdate",
+ "CreateDatasetRequest",
]
\ No newline at end of file
diff --git a/runtime/datamate-python/app/module/dataset/schema/dataset.py b/runtime/datamate-python/app/module/dataset/schema/dataset.py
index 84334d8c..8095857f 100644
--- a/runtime/datamate-python/app/module/dataset/schema/dataset.py
+++ b/runtime/datamate-python/app/module/dataset/schema/dataset.py
@@ -9,6 +9,7 @@ class DatasetType(Enum):
IMAGE = "IMAGE"
AUDIO = "AUDIO"
VIDEO = "VIDEO"
+ OTHER = "OTHER"
class DatasetTypeResponse(BaseModel):
"""数据集类型响应模型"""
@@ -18,6 +19,16 @@ class DatasetTypeResponse(BaseModel):
supportedFormats: List[str] = Field(default_factory=list, description="支持的文件格式")
icon: Optional[str] = Field(None, description="图标")
+class CreateDatasetRequest(BaseModel):
+ """创建数据集请求模型"""
+ name: str = Field(..., description="数据集名称", min_length=1, max_length=100)
+ description: Optional[str] = Field(None, description="数据集描述", max_length=500)
+ datasetType: DatasetType = Field(..., description="数据集类型", alias="datasetType")
+ tags: Optional[List[str]] = Field(None, description="标签列表")
+ dataSource: Optional[str] = Field(None, description="数据源")
+ retentionDays: Optional[int] = Field(None, description="保留天数")
+ status: Optional[str] = Field(None, description="数据集状态")
+
class DatasetResponse(BaseModel):
"""DM服务数据集响应模型"""
id: str = Field(..., description="数据集ID")
diff --git a/runtime/datamate-python/app/module/dataset/service/service.py b/runtime/datamate-python/app/module/dataset/service/service.py
index ff5869d7..5c1ddb4d 100644
--- a/runtime/datamate-python/app/module/dataset/service/service.py
+++ b/runtime/datamate-python/app/module/dataset/service/service.py
@@ -62,6 +62,84 @@ async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]:
logger.error(f"Failed to get dataset {dataset_id}: {e}")
return None
+ async def create_dataset(
+ self,
+ name: str,
+ dataset_type: str,
+ description: str = "",
+ status: Optional[str] = None,
+ ) -> DatasetResponse:
+ """
+ 创建数据集(参考Java版本DatasetApplicationService.createDataset)
+
+ Args:
+ name: 数据集名称
+ dataset_type: 数据集类型(TEXT/IMAGE/VIDEO/AUDIO/OTHER)
+ description: 数据集描述
+ status: 数据集状态
+
+ Returns:
+ 创建的数据集响应
+ """
+ try:
+ logger.info(f"Creating dataset: {name}, type: {dataset_type}")
+
+            # 1. Check whether a dataset with the same name already exists
+ result = await self.db.execute(
+ select(Dataset).where(Dataset.name == name)
+ )
+ existing_dataset = result.scalar_one_or_none()
+ if existing_dataset:
+ error_msg = f"Dataset with name '{name}' already exists"
+ logger.error(error_msg)
+ raise Exception(error_msg)
+
+            # 2. Build the dataset entity
+            dataset_id = str(uuid.uuid4())
+            dataset_path = os.path.join("/dataset", dataset_id)
+
+            # Default to DRAFT when no status is provided
+            if status is None:
+                status = "DRAFT"
+
+ new_dataset = Dataset(
+ id=dataset_id,
+ name=name,
+ description=description,
+ dataset_type=dataset_type,
+ path=dataset_path,
+ size_bytes=0,
+ file_count=0,
+ status=status,
+ dataset_metadata="{}",
+ version=0,
+ created_by="system",
+ )
+
+ self.db.add(new_dataset)
+ await self.db.flush()
+ await self.db.commit()
+
+ logger.info(f"Successfully created dataset: {new_dataset.id}")
+
+ return DatasetResponse(
+ id=new_dataset.id, # type: ignore
+ name=new_dataset.name, # type: ignore
+ description=new_dataset.description or "", # type: ignore
+ datasetType=new_dataset.dataset_type, # type: ignore
+ status=new_dataset.status, # type: ignore
+ fileCount=new_dataset.file_count or 0, # type: ignore
+ totalSize=new_dataset.size_bytes or 0, # type: ignore
+ createdAt=new_dataset.created_at, # type: ignore
+ updatedAt=new_dataset.updated_at, # type: ignore
+ createdBy=new_dataset.created_by # type: ignore
+ )
+
+ except Exception as e:
+ await self.db.rollback()
+ logger.error(f"Failed to create dataset: {e}")
+ raise Exception(f"Failed to create dataset: {str(e)}")
+
async def get_dataset_files(
self,
dataset_id: str,
diff --git a/runtime/datamate-python/app/module/operator/README.md b/runtime/datamate-python/app/module/operator/README.md
new file mode 100644
index 00000000..703e8ed3
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/README.md
@@ -0,0 +1,138 @@
+# Operator Market Service - Python Implementation
+
+## Overview
+
+This is the Python implementation of `operator-market-service`, integrated into the `runtime/datamate-python` project.
+
+## Features
+
+- **Operator management**: create, query, update and delete operators
+- **Category management**: tree-structured category queries
+- **File upload**: upload and parse operator packages (tar/zip supported)
+- **MCP tool integration**: MCP tool endpoints exposed via fastapi-mcp
+
+## Directory Structure
+
+```
+app/module/operator/
+├── __init__.py                        # Module entry point
+├── constants.py                       # Constant definitions
+├── exceptions.py                      # Exception definitions
+├── schema/                            # Pydantic schema definitions
+│   ├── __init__.py
+│   ├── operator.py                    # Operator schemas
+│   ├── category.py                    # Category schemas
+│   └── release.py                     # Release schemas
+├── parsers/                           # File parsers
+│   ├── __init__.py
+│   ├── abstract_parser.py             # Abstract parser base class
+│   ├── tar_parser.py                  # TAR archive parser
+│   ├── zip_parser.py                  # ZIP archive parser
+│   └── parser_holder.py               # Parser holder
+├── repository/                        # Data access layer
+│   ├── __init__.py
+│   ├── operator_repository.py
+│   ├── category_repository.py
+│   ├── category_relation_repository.py
+│   └── operator_release_repository.py
+├── service/                           # Service layer
+│   ├── __init__.py
+│   ├── operator_service.py
+│   └── category_service.py
+└── interface/                         # API layer
+    ├── __init__.py
+    ├── operator_routes.py
+    └── category_routes.py
+```
+
+## API Endpoints
+
+### Operators (`/api/operator-market/operators`)
+
+| Method | Path | Description |
+|--------|------|-------------|
+| POST | `/list` | List operators (pagination, category filter, keyword search) |
+| GET | `/{operator_id}` | Get operator details |
+| PUT | `/{operator_id}` | Update an operator |
+| POST | `/create` | Create a new operator |
+| POST | `/upload` | Register an uploaded operator package |
+| POST | `/upload/pre-upload` | Pre-upload (obtain a request ID) |
+| POST | `/upload/chunk` | Chunked upload |
+| DELETE | `/{operator_id}` | Delete an operator |
+| GET | `/examples/download` | Download the example operator |
+
+### Categories (`/api/operator-market/categories`)
+
+| Method | Path | Description |
+|--------|------|-------------|
+| GET | `/tree` | Get the category tree |
+
+## Database Tables
+
+- `t_operator` - operators
+- `t_operator_category` - categories
+- `t_operator_category_relation` - operator/category relations
+- `t_operator_release` - operator release versions
+- `v_operator` - operator view (includes category information)
+
+## Supported File Format
+
+An operator package must contain a `metadata.yml` file with the following structure:
+
+```yaml
+raw_id: "operator-id"
+name: "算子名称"
+description: "算子描述"
+version: "1.0.0"
+language: "python" # python, java
+modal: "text" # text, image, audio, video
+vendor: "datamate" # datamate, data-juicer, or other
+inputs: {...}
+outputs: {...}
+runtime: {...}
+settings: {...}
+metrics: {...}
+release:
+ - "更新日志1"
+ - "更新日志2"
+```
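+
+During parsing, `language`, `modal` and `vendor` are mapped to built-in category IDs via `CATEGORY_MAP` (see `parsers/abstract_parser.py`); an unrecognized `vendor` falls back to the "other vendor" category, and every uploaded operator is additionally tagged with the "customized" category.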
+
+## Pending Features
+
+- [ ] Complete the operator starring feature
+- [ ] Tag-based filtering
+
+## Usage Examples
+
+### List operators
+
+```bash
+curl -X POST "http://localhost:18000/api/operator-market/operators/list" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "page": 1,
+ "size": 10,
+ "keyword": "test",
+ "isStar": false
+ }'
+```
+
+### Get the category tree
+
+```bash
+curl -X GET "http://localhost:18000/api/operator-market/categories/tree"
+```
+
+### Create an operator
+
+```bash
+curl -X POST "http://localhost:18000/api/operator-market/operators/create" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "id": "new-operator-id",
+ "name": "新算子",
+ "description": "这是一个新算子",
+ "version": "1.0.0",
+ "fileName": "operator.tar"
+ }'
+```
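+
+### Chunked upload (sketch)
+
+A minimal sketch of the chunked-upload flow, assuming a single chunk and a local `operator.tar`; the form fields follow the `/upload/pre-upload`, `/upload/chunk` and `/upload` routes above:
+
+```bash
+# 1. Obtain a request ID
+curl -X POST "http://localhost:18000/api/operator-market/operators/upload/pre-upload"
+
+# 2. Upload the file as a single chunk (reqId comes from step 1)
+curl -X POST "http://localhost:18000/api/operator-market/operators/upload/chunk" \
+  -F "reqId=<req-id-from-step-1>" \
+  -F "fileNo=1" \
+  -F "fileName=operator.tar" \
+  -F "totalChunkNum=1" \
+  -F "chunkNo=1" \
+  -F "file=@operator.tar"
+
+# 3. Register the uploaded file as an operator
+curl -X POST "http://localhost:18000/api/operator-market/operators/upload" \
+  -H "Content-Type: application/json" \
+  -d '{"fileName": "operator.tar"}'
+```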
diff --git a/runtime/datamate-python/app/module/operator/__init__.py b/runtime/datamate-python/app/module/operator/__init__.py
new file mode 100644
index 00000000..1ac84e31
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/__init__.py
@@ -0,0 +1,4 @@
+"""
+Operator Market Service Module
+算子市场服务模块
+"""
diff --git a/runtime/datamate-python/app/module/operator/constants.py b/runtime/datamate-python/app/module/operator/constants.py
new file mode 100644
index 00000000..e6d83ee9
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/constants.py
@@ -0,0 +1,50 @@
+"""
+Operator Market Constants
+算子市场常量定义
+"""
+
+# Service ID
+SERVICE_ID = "operator"
+
+# YAML metadata path
+YAML_PATH = "metadata.yml"
+
+# Example operator file path
+EXAMPLE_OPERATOR_PATH = "/app/test_operator.tar"
+
+# Category IDs
+CATEGORY_PYTHON = "python"
+CATEGORY_PYTHON_ID = "9eda9d5d-072b-499b-916c-797a0a8750e1"
+
+CATEGORY_JAVA = "java"
+CATEGORY_JAVA_ID = "b5bfc548-8ef6-417c-b8a6-a4197c078249"
+
+CATEGORY_CUSTOMIZED_ID = "ec2cdd17-8b93-4a81-88c4-ac9e98d10757"
+CATEGORY_TEXT_ID = "d8a5df7a-52a9-42c2-83c4-01062e60f597"
+CATEGORY_IMAGE_ID = "de36b61c-9e8a-4422-8c31-d30585c7100f"
+CATEGORY_AUDIO_ID = "42dd9392-73e4-458c-81ff-41751ada47b5"
+CATEGORY_VIDEO_ID = "a233d584-73c8-4188-ad5d-8f7c8dda9c27"
+CATEGORY_ALL_ID = "4d7dbd77-0a92-44f3-9056-2cd62d4a71e4"
+CATEGORY_STAR_ID = "51847c24-bba9-11f0-888b-5b143cb738aa"
+CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3"
+CATEGORY_DATAMATE_ID = "431e7798-5426-4e1a-aae6-b9905a836b34"
+CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996"
+CATEGORY_OTHER_VENDOR_ID = "f00eaa3e-96c1-4de4-96cd-9848ef5429ec"
+
+# Category mapping
+CATEGORY_MAP = {
+ CATEGORY_PYTHON: CATEGORY_PYTHON_ID,
+ CATEGORY_JAVA: CATEGORY_JAVA_ID,
+ "text": CATEGORY_TEXT_ID,
+ "image": CATEGORY_IMAGE_ID,
+ "audio": CATEGORY_AUDIO_ID,
+ "video": CATEGORY_VIDEO_ID,
+ "all": CATEGORY_ALL_ID,
+ "datamate": CATEGORY_DATAMATE_ID,
+ "data-juicer": CATEGORY_DATA_JUICER_ID,
+}
+
+# File paths
+OPERATOR_BASE_PATH = "/operators"
+UPLOAD_DIR = "upload"
+EXTRACT_DIR = "extract"
diff --git a/runtime/datamate-python/app/module/operator/exceptions.py b/runtime/datamate-python/app/module/operator/exceptions.py
new file mode 100644
index 00000000..6eca13f5
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/exceptions.py
@@ -0,0 +1,72 @@
+"""
+Operator Market Exceptions
+算子市场异常定义
+"""
+from enum import Enum
+from typing import Optional
+
+
+class OperatorErrorCode:
+ """算子错误码"""
+ def __init__(self, message: str, error_code: str):
+ self.message = message
+ self.error_code = error_code
+
+
+class OperatorException(RuntimeError):
+ """算子异常基类"""
+ def __init__(self, operator_error_code: OperatorErrorCode):
+ self.message = operator_error_code.message
+ self.error_code = operator_error_code.error_code
+ super().__init__(self.message)
+
+
+class OperatorErrorCodeEnum(Enum):
+ """算子错误码枚举"""
+ FIELD_NOT_FOUND = OperatorErrorCode(
+ "必填字段缺失", "OPERATOR_FIELD_NOT_FOUND"
+ )
+ SETTINGS_PARSE_FAILED = OperatorErrorCode(
+ "设置解析失败", "OPERATOR_SETTINGS_PARSE_FAILED"
+ )
+ OPERATOR_IN_INSTANCE = OperatorErrorCode(
+ "算子正在使用中", "OPERATOR_IN_INSTANCE"
+ )
+ CANT_DELETE_PREDEFINED_OPERATOR = OperatorErrorCode(
+ "无法删除预定义算子", "CANT_DELETE_PREDEFINED_OPERATOR"
+ )
+
+
+class FieldNotFoundError(OperatorException):
+ """必填字段缺失"""
+ def __init__(self, field_name: str):
+ super().__init__(
+ OperatorErrorCodeEnum.FIELD_NOT_FOUND.value
+ )
+ self.message = f"Required field '{field_name}' is missing"
+ self.field_name = field_name
+
+
+class SettingsParseError(OperatorException):
+ """设置解析失败"""
+ def __init__(self, detail: Optional[str] = None):
+ super().__init__(
+ OperatorErrorCodeEnum.SETTINGS_PARSE_FAILED.value
+ )
+ self.detail = detail
+
+
+class OperatorInInstanceError(OperatorException):
+ """算子正在使用中"""
+ def __init__(self):
+ super().__init__(
+ OperatorErrorCodeEnum.OPERATOR_IN_INSTANCE.value
+ )
+
+
+class CannotDeletePredefinedOperatorError(OperatorException):
+ """无法删除预定义算子"""
+ def __init__(self):
+ super().__init__(
+ OperatorErrorCodeEnum.CANT_DELETE_PREDEFINED_OPERATOR.value
+ )
diff --git a/runtime/datamate-python/app/module/operator/interface/__init__.py b/runtime/datamate-python/app/module/operator/interface/__init__.py
new file mode 100644
index 00000000..f83ad24f
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/interface/__init__.py
@@ -0,0 +1,9 @@
+"""
+Operator Market API Interfaces
+算子市场 API 接口层
+"""
+from .operator_routes import router as operator_router
+from .category_routes import router as category_router
+
+
+__all__ = ["operator_router", "category_router"]
diff --git a/runtime/datamate-python/app/module/operator/interface/category_routes.py b/runtime/datamate-python/app/module/operator/interface/category_routes.py
new file mode 100644
index 00000000..7483a5f0
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/interface/category_routes.py
@@ -0,0 +1,55 @@
+"""
+Category API Routes
+分类 API 路由
+"""
+from fastapi import APIRouter, Depends
+
+from app.db.models.operator import Category, CategoryRelation, Operator
+from app.db.session import get_db
+from app.module.operator.repository import (
+ CategoryRepository,
+ CategoryRelationRepository,
+)
+from app.module.operator.repository.operator_repository import OperatorRepository
+from app.module.operator.schema import CategoryTreePagedResponse
+from app.module.operator.schema.category import PaginatedCategoryTree
+from app.module.operator.service import CategoryService
+from app.module.shared.schema import StandardResponse
+
+router = APIRouter(prefix="/categories", tags=["Category"])
+
+
+def get_category_service() -> CategoryService:
+ """获取分类服务实例"""
+ return CategoryService(
+ category_repo=CategoryRepository(Category()),
+ category_relation_repo=CategoryRelationRepository(CategoryRelation()),
+ operator_repo=OperatorRepository(Operator()),
+ )
+
+
+@router.get(
+ "/tree",
+ response_model=StandardResponse[PaginatedCategoryTree],
+ summary="获取分类树",
+ description="获取算子树状分类结构,包含分组维度(如语言、模态)及资源统计数量",
+ tags=['mcp']
+)
+async def get_category_tree(
+ service: CategoryService = Depends(get_category_service),
+ db=Depends(get_db)
+):
+ """获取分类树"""
+ result = await service.get_all_categories(db)
+
+ return StandardResponse(
+ code="0",
+ message="success",
+ data=PaginatedCategoryTree(
+ page=0,
+ size=len(result.categories),
+ total_elements=len(result.categories),
+ total_pages=1,
+ star_count=result.star_count,
+ content=result.categories,
+ ))
diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py
new file mode 100644
index 00000000..4ae78f3a
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py
@@ -0,0 +1,249 @@
+"""
+Operator API Routes
+算子 API 路由
+"""
+from typing import Optional
+
+from fastapi import APIRouter, Depends, UploadFile, Form, File, Body, HTTPException
+from fastapi.responses import FileResponse
+
+from app.core.logging import get_logger
+from app.db.models.operator import Operator, CategoryRelation, OperatorRelease
+from app.db.session import get_db
+from app.module.operator.parsers import ParserHolder
+from app.module.operator.repository import (
+ OperatorRepository,
+ CategoryRelationRepository,
+ OperatorReleaseRepository,
+)
+from app.module.operator.schema import (
+ OperatorDto,
+ OperatorUpdateDto,
+ OperatorListRequest,
+)
+from app.module.operator.service import OperatorService
+from app.module.shared.chunk_upload_repository import ChunkUploadRepository
+from app.module.shared.file_service import FileService
+from app.module.shared.schema import StandardResponse, PaginatedData
+
+logger = get_logger(__name__)
+
+router = APIRouter(prefix="/operators", tags=["Operator"])
+
+
+def get_operator_service() -> OperatorService:
+ """获取算子服务实例"""
+ return OperatorService(
+ operator_repo=OperatorRepository(Operator()),
+ category_relation_repo=CategoryRelationRepository(CategoryRelation()),
+ operator_release_repo=OperatorReleaseRepository(OperatorRelease()),
+ parser_holder=ParserHolder(),
+ file_service=FileService(ChunkUploadRepository()),
+ )
+
+
+@router.post(
+ "/list",
+ response_model=StandardResponse[PaginatedData[OperatorDto]],
+ summary="查询算子列表",
+ description="根据参数查询算子列表(支持分页、分类过滤、关键词搜索)",
+ tags=['mcp']
+)
+async def list_operators(
+ request: OperatorListRequest,
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db),
+):
+ """查询算子列表"""
+ operators = await service.get_operators(
+ page=request.page,
+ size=request.size,
+ categories=request.categories,
+ keyword=request.keyword,
+ is_star=request.is_star,
+ db=db
+ )
+
+ count = await service.count_operators(
+ categories=request.categories,
+ keyword=request.keyword,
+ is_star=request.is_star,
+ db=db
+ )
+
+ total_pages = (count + request.size - 1) // request.size
+
+ return StandardResponse(
+ code="0",
+ message="success",
+ data=PaginatedData(
+ page=request.page,
+ size=request.size,
+ total_elements=count,
+ total_pages=total_pages,
+ content=operators,
+ )
+ )
+
+
+@router.get(
+ "/{operator_id}",
+ response_model=StandardResponse[OperatorDto],
+ summary="获取算子详情",
+ description="根据 ID 获取算子详细信息"
+)
+async def get_operator(
+ operator_id: str,
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db)
+):
+ """获取算子详情"""
+ operator = await service.get_operator_by_id(operator_id, db)
+ operator.file_name = None
+ return StandardResponse(code="0", message="success", data=operator)
+
+
+@router.put(
+ "/{operator_id}",
+ response_model=StandardResponse[OperatorDto],
+ summary="更新算子",
+ description="更新算子信息"
+)
+async def update_operator(
+ operator_id: str,
+ request: OperatorUpdateDto,
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db)
+):
+ """更新算子"""
+ operator = await service.update_operator(operator_id, request, db)
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=operator)
+
+
+@router.post(
+ "/create",
+ response_model=StandardResponse[OperatorDto],
+ summary="创建算子",
+ description="创建新算子"
+)
+async def create_operator(
+ request: OperatorDto,
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db)
+):
+ """创建算子"""
+ operator = await service.create_operator(request, db)
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=operator)
+
+
+@router.post(
+ "/upload",
+ response_model=StandardResponse[OperatorDto],
+ summary="上传算子",
+ description="上传算子文件并解析元数据"
+)
+async def upload_operator(
+ request: dict = Body(...),
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db),
+):
+ """上传算子"""
+ file_name = request.get("fileName")
+ if not file_name:
+        raise HTTPException(status_code=422, detail="fileName is required")
+ operator = await service.upload_operator(file_name, db)
+ return StandardResponse(code="0", message="success", data=operator)
+
+
+@router.post(
+ "/upload/pre-upload",
+ response_model=StandardResponse[str],
+ summary="预上传",
+ description="获取预上传 ID,用于分块上传"
+)
+async def pre_upload(
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db),
+):
+ """预上传"""
+ req_id = await service.pre_upload(db)
+ await db.commit()
+ return StandardResponse(
+ code="0",
+ message="success",
+ data=req_id,
+ )
+
+
+@router.post(
+ "/upload/chunk",
+ response_model=StandardResponse[dict],
+ summary="分块上传",
+ description="分块上传算子文件"
+)
+async def chunk_upload(
+ req_id: str = Form(..., alias="reqId", description="预上传ID"),
+ file_no: int = Form(1, alias="fileNo", description="文件编号"),
+ file_name: str = Form(..., alias="fileName", description="文件名"),
+ total_chunk_num: int = Form(1, alias="totalChunkNum", description="总分块数"),
+ chunk_no: int = Form(1, alias="chunkNo", description="当前分块号"),
+ file: UploadFile = File(...),
+ check_sum_hex: Optional[str] = Form(None, alias="checkSumHex", description="校验和"),
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db),
+):
+ """分块上传"""
+ file_content = await file.read()
+ result = await service.chunk_upload(
+ req_id=req_id,
+ file_no=file_no,
+ file_name=file_name,
+ total_chunk_num=total_chunk_num,
+ chunk_no=chunk_no,
+ check_sum_hex=check_sum_hex,
+ file_content=file_content,
+ db=db
+ )
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=result.dict())
+
+
+@router.delete(
+ "/{operator_id}",
+ response_model=StandardResponse[None],
+ summary="删除算子",
+ description="删除算子"
+)
+async def delete_operator(
+ operator_id: str,
+ service: OperatorService = Depends(get_operator_service),
+ db = Depends(get_db),
+):
+ """删除算子"""
+ await service.delete_operator(operator_id, db)
+ await db.commit()
+ return StandardResponse(code="0", message="success", data=None)
+
+
+@router.get(
+ "/examples/download",
+ response_class=FileResponse,
+ summary="下载示例算子",
+ description="下载示例算子文件"
+)
+async def download_example_operator(
+ service: OperatorService = Depends(get_operator_service),
+):
+ """下载示例算子"""
+    from app.module.operator.constants import EXAMPLE_OPERATOR_PATH
+
+    file_path = service.download_example_operator(EXAMPLE_OPERATOR_PATH)
+ return FileResponse(
+ path=str(file_path),
+ filename=file_path.name,
+ media_type="application/octet-stream"
+ )
diff --git a/runtime/datamate-python/app/module/operator/parsers/__init__.py b/runtime/datamate-python/app/module/operator/parsers/__init__.py
new file mode 100644
index 00000000..db3c0504
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/parsers/__init__.py
@@ -0,0 +1,15 @@
+"""
+Operator File Parsers
+算子文件解析器
+"""
+from .abstract_parser import AbstractParser
+from .tar_parser import TarParser
+from .zip_parser import ZipParser
+from .parser_holder import ParserHolder
+
+__all__ = [
+ "AbstractParser",
+ "TarParser",
+ "ZipParser",
+ "ParserHolder",
+]
diff --git a/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py
new file mode 100644
index 00000000..50ee98cf
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py
@@ -0,0 +1,118 @@
+"""
+Abstract Parser
+抽象解析器基类
+"""
+import json
+import yaml
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional
+
+from app.module.operator.schema import OperatorDto, OperatorReleaseDto
+from app.module.operator.constants import CATEGORY_MAP, CATEGORY_OTHER_VENDOR_ID, CATEGORY_CUSTOMIZED_ID
+from app.module.operator.exceptions import FieldNotFoundError
+
+
+class AbstractParser(ABC):
+ """算子文件解析器抽象基类"""
+
+ @abstractmethod
+ def parse_yaml_from_archive(
+ self,
+ archive_path: str,
+ entry_path: str,
+ file_name: Optional[str] = None,
+ file_size: Optional[int] = None
+ ) -> OperatorDto:
+ """
+ 从压缩包内读取指定路径的 yaml 文件并解析为 OperatorDto
+
+ Args:
+ archive_path: 压缩包路径(zip 或 tar)
+ entry_path: 压缩包内部的文件路径,例如 "config/app.yaml"
+
+ Returns:
+ 解析后的 OperatorDto
+ """
+ pass
+
+ @abstractmethod
+ def extract_to(self, archive_path: str, target_dir: str) -> None:
+ """
+ 将压缩包解压到目标目录(保持相对路径)
+
+ Args:
+ archive_path: 压缩包路径
+ target_dir: 目标目录
+ """
+ pass
+
+ def parse_yaml(
+ self,
+ yaml_content: str,
+ file_name: Optional[str] = None,
+ file_size: Optional[int] = None
+ ) -> OperatorDto:
+ """解析 YAML 内容为 OperatorDto"""
+ content: Dict[str, Any] = yaml.safe_load(yaml_content)
+
+ operator = OperatorDto(
+            id=self._to_string(content.get("raw_id"), "raw_id"),
+            name=self._to_string(content.get("name"), "name"),
+            description=self._to_string(content.get("description"), "description"),
+            version=self._to_string(content.get("version"), "version"),
+ inputs=self._to_json(content.get("inputs")),
+ outputs=self._to_json(content.get("outputs")),
+ runtime=self._to_json(content.get("runtime")),
+ settings=self._to_json(content.get("settings")),
+ metrics=self._to_json(content.get("metrics")),
+ file_name=file_name,
+ file_size=file_size,
+ )
+
+ # Handle changelog
+ changelog = content.get("release")
+ if isinstance(changelog, list):
+ operator_release = OperatorReleaseDto(
+ id=operator.id,
+ version=operator.version,
+ changelog=changelog
+ )
+ else:
+ operator_release = OperatorReleaseDto(
+ id=operator.id,
+ version=operator.version,
+ changelog=[]
+ )
+ operator.releases = [operator_release]
+
+ # Build categories
+ categories = [
+ CATEGORY_MAP.get(self._to_lower(content.get("language")), ""),
+ CATEGORY_MAP.get(self._to_lower(content.get("modal")), ""),
+ CATEGORY_MAP.get(self._to_lower(content.get("vendor")), CATEGORY_OTHER_VENDOR_ID),
+ CATEGORY_CUSTOMIZED_ID,
+ ]
+ operator.categories = categories
+
+ return operator
+
+    def _to_string(self, obj: Any, field_name: str = "field") -> str:
+        """Convert to string, raising FieldNotFoundError when the required field is missing"""
+        if obj is None:
+            raise FieldNotFoundError(field_name)
+        return str(obj)
+
+    def _to_lower(self, obj: Any, field_name: str = "field") -> str:
+        """Convert to a lowercase string, raising FieldNotFoundError when the required field is missing"""
+        if obj is None:
+            raise FieldNotFoundError(field_name)
+        return str(obj).lower()
+
+ def _to_json(self, obj: Any) -> Optional[str]:
+ """转换为 JSON 字符串"""
+ if obj is None:
+ return None
+ try:
+ return json.dumps(obj).strip('"').strip("'")
+ except (TypeError, ValueError) as e:
+ raise ValueError(f"Failed to serialize to JSON: {e}")
diff --git a/runtime/datamate-python/app/module/operator/parsers/parser_holder.py b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py
new file mode 100644
index 00000000..83522df4
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py
@@ -0,0 +1,59 @@
+"""
+Parser Holder
+解析器持有者,根据文件类型选择合适的解析器
+"""
+import os
+from typing import Dict, Type, Optional
+
+from app.module.operator.parsers.abstract_parser import AbstractParser
+from app.module.operator.parsers.tar_parser import TarParser
+from app.module.operator.parsers.zip_parser import ZipParser
+from app.module.operator.schema import OperatorDto
+
+
+class ParserHolder:
+ """解析器持有者,根据文件类型选择解析器"""
+
+ def __init__(self):
+ self._parsers: Dict[str, AbstractParser] = {
+ "tar": TarParser(),
+ "gz": TarParser(),
+ "tgz": TarParser(),
+ "zip": ZipParser(),
+ }
+
+ def get_parser(self, file_path: str) -> AbstractParser:
+ """根据文件扩展名获取解析器"""
+ _, ext = os.path.splitext(file_path)
+ file_type = ext.lstrip('.').lower()
+
+ if file_type not in self._parsers:
+ raise ValueError(f"Unsupported file type: {file_type}")
+
+ return self._parsers[file_type]
+
+ def parse_yaml_from_archive(
+ self,
+ file_type: str,
+ archive_path: str,
+ entry_path: str,
+ file_name: Optional[str] = None,
+ file_size: Optional[int] = None
+ ) -> OperatorDto:
+ """从压缩包解析 YAML"""
+ if file_type not in self._parsers:
+ raise ValueError(f"Unsupported file type: {file_type}")
+
+ return self._parsers[file_type].parse_yaml_from_archive(
+ archive_path,
+ entry_path,
+ file_name,
+ file_size
+ )
+
+ def extract_to(self, file_type: str, archive_path: str, target_dir: str) -> None:
+ """解压文件到目标目录"""
+ if file_type not in self._parsers:
+ raise ValueError(f"Unsupported file type: {file_type}")
+
+ self._parsers[file_type].extract_to(archive_path, target_dir)
diff --git a/runtime/datamate-python/app/module/operator/parsers/tar_parser.py b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py
new file mode 100644
index 00000000..9ce87f88
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py
@@ -0,0 +1,47 @@
+"""
+Tar File Parser
+TAR 文件解析器
+"""
+import tarfile
+import os
+from typing import Optional
+
+from app.module.operator.parsers.abstract_parser import AbstractParser
+from app.module.operator.schema import OperatorDto
+
+
+class TarParser(AbstractParser):
+ """TAR 压缩包解析器"""
+
+ def parse_yaml_from_archive(
+ self,
+ archive_path: str,
+ entry_path: str,
+ file_name: Optional[str] = None,
+ file_size: Optional[int] = None
+ ) -> OperatorDto:
+ """从 TAR 文件中解析 YAML"""
+ try:
+ with tarfile.open(archive_path, 'r:*') as tar:
+ for member in tar.getmembers():
+ if member.name == entry_path or member.name.endswith(f"/{entry_path}"):
+ file = tar.extractfile(member)
+ if file:
+ content = file.read().decode('utf-8')
+ return self.parse_yaml(content, file_name, file_size)
+ raise FileNotFoundError(f"File '{entry_path}' not found in archive")
+ except (tarfile.TarError, EOFError) as e:
+ raise ValueError(f"Failed to parse TAR file: {e}")
+
+ def extract_to(self, archive_path: str, target_dir: str) -> None:
+ """解压 TAR 文件到目标目录"""
+ try:
+ os.makedirs(target_dir, exist_ok=True)
+ with tarfile.open(archive_path, 'r:*') as tar:
+ # Safety check: prevent path traversal
+ for member in tar.getmembers():
+ if os.path.isabs(member.name) or ".." in member.name.split("/"):
+ raise ValueError(f"Unsafe path in archive: {member.name}")
+ tar.extractall(target_dir)
+ except (tarfile.TarError, EOFError) as e:
+ raise ValueError(f"Failed to extract TAR file: {e}")
diff --git a/runtime/datamate-python/app/module/operator/parsers/zip_parser.py b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py
new file mode 100644
index 00000000..db4a1b73
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py
@@ -0,0 +1,46 @@
+"""
+Zip File Parser
+ZIP 文件解析器
+"""
+import zipfile
+import os
+from typing import Optional
+
+from app.module.operator.parsers.abstract_parser import AbstractParser
+from app.module.operator.schema import OperatorDto
+
+
+class ZipParser(AbstractParser):
+ """ZIP 压缩包解析器"""
+
+ def parse_yaml_from_archive(
+ self,
+ archive_path: str,
+ entry_path: str,
+ file_name: Optional[str] = None,
+ file_size: Optional[int] = None
+ ) -> OperatorDto:
+ """从 ZIP 文件中解析 YAML"""
+ try:
+ with zipfile.ZipFile(archive_path, 'r') as zf:
+ for name in zf.namelist():
+ if name == entry_path or name.endswith(f"/{entry_path}"):
+ with zf.open(name) as file:
+ content = file.read().decode('utf-8')
+ return self.parse_yaml(content, file_name, file_size)
+ raise FileNotFoundError(f"File '{entry_path}' not found in archive")
+ except (zipfile.BadZipFile, zipfile.LargeZipFile) as e:
+ raise ValueError(f"Failed to parse ZIP file: {e}")
+
+ def extract_to(self, archive_path: str, target_dir: str) -> None:
+ """解压 ZIP 文件到目标目录"""
+ try:
+ os.makedirs(target_dir, exist_ok=True)
+ with zipfile.ZipFile(archive_path, 'r') as zf:
+ # Safety check: prevent path traversal
+ for name in zf.namelist():
+ if os.path.isabs(name) or ".." in name.split("/"):
+ raise ValueError(f"Unsafe path in archive: {name}")
+ zf.extractall(target_dir)
+ except (zipfile.BadZipFile, zipfile.LargeZipFile) as e:
+ raise ValueError(f"Failed to extract ZIP file: {e}")
diff --git a/runtime/datamate-python/app/module/operator/repository/__init__.py b/runtime/datamate-python/app/module/operator/repository/__init__.py
new file mode 100644
index 00000000..67859d72
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/repository/__init__.py
@@ -0,0 +1,15 @@
+"""
+Operator Market Repositories
+算子市场数据访问层
+"""
+from .operator_repository import OperatorRepository
+from .category_repository import CategoryRepository
+from .category_relation_repository import CategoryRelationRepository
+from .operator_release_repository import OperatorReleaseRepository
+
+__all__ = [
+ "OperatorRepository",
+ "CategoryRepository",
+ "CategoryRelationRepository",
+ "OperatorReleaseRepository",
+]
diff --git a/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py
new file mode 100644
index 00000000..b7de1e99
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py
@@ -0,0 +1,77 @@
+"""
+Category Relation Repository
+分类关系数据访问层
+"""
+from typing import List
+
+from sqlalchemy import select, delete, and_
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models.operator import CategoryRelation
+from app.module.operator.constants import CATEGORY_PREDEFINED_ID
+
+
+class CategoryRelationRepository:
+ """分类关系数据访问层"""
+
+ def __init__(self, model: CategoryRelation):
+ self.model = model
+
+ async def find_all(self, db: AsyncSession) -> List[CategoryRelation]:
+ """查询所有分类关系"""
+ result = await db.execute(select(CategoryRelation))
+ return result.scalars().all()
+
+ async def batch_insert(
+ self,
+ operator_id: str,
+ category_ids: List[str],
+ db: AsyncSession
+ ) -> None:
+ """批量插入分类关系"""
+ for category_id in category_ids:
+ entity = CategoryRelation(
+ category_id=category_id,
+ operator_id=operator_id
+ )
+ db.add(entity)
+
+ async def batch_update(
+ self,
+ operator_id: str,
+ category_ids: List[str],
+ db: AsyncSession
+ ) -> None:
+ """批量更新分类关系(先删除后插入)"""
+ # Delete existing relations
+ await db.execute(
+ delete(CategoryRelation)
+ .where(CategoryRelation.operator_id == operator_id)
+ )
+ # Insert new relations
+ for category_id in category_ids:
+ entity = CategoryRelation(
+ category_id=category_id,
+ operator_id=operator_id
+ )
+ db.add(entity)
+
+ async def delete_by_operator_id(self, operator_id: str, db: AsyncSession) -> None:
+ """根据算子ID删除分类关系"""
+ await db.execute(
+ delete(CategoryRelation)
+ .where(CategoryRelation.operator_id == operator_id)
+ )
+
+ async def operator_is_predefined(self, operator_id: str, db: AsyncSession) -> bool:
+ """检查算子是否为预定义算子"""
+ result = await db.execute(
+ select(CategoryRelation)
+ .where(
+ and_(
+ CategoryRelation.operator_id == operator_id,
+ CategoryRelation.category_id == CATEGORY_PREDEFINED_ID
+ )
+ )
+ )
+ return result.first() is not None
diff --git a/runtime/datamate-python/app/module/operator/repository/category_repository.py b/runtime/datamate-python/app/module/operator/repository/category_repository.py
new file mode 100644
index 00000000..76e472e6
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/repository/category_repository.py
@@ -0,0 +1,23 @@
+"""
+Category Repository
+分类数据访问层
+"""
+from typing import List
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models.operator import Category
+from app.module.operator.schema import CategoryDto
+
+
+class CategoryRepository:
+ """分类数据访问层"""
+
+ def __init__(self, model: Category):
+ self.model = model
+
+ async def find_all(self, db: AsyncSession) -> List[Category]:
+ """查询所有分类"""
+ result = await db.execute(select(Category))
+ return result.scalars().all()
diff --git a/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py
new file mode 100644
index 00000000..bcab7be8
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py
@@ -0,0 +1,72 @@
+"""
+Operator Release Repository
+算子发布版本数据访问层
+"""
+from typing import List
+
+from sqlalchemy import select, delete, and_
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models.operator import OperatorRelease
+from app.module.operator.schema import OperatorReleaseDto
+
+
+class OperatorReleaseRepository:
+ """算子发布版本数据访问层"""
+
+ def __init__(self, model: OperatorRelease):
+ self.model = model
+
+ async def find_all_by_operator_id(
+ self,
+ operator_id: str,
+ db: AsyncSession
+ ) -> List[OperatorRelease]:
+ """查询算子的所有发布版本"""
+ result = await db.execute(
+ select(OperatorRelease)
+ .where(OperatorRelease.id == operator_id)
+ .order_by(OperatorRelease.release_date.desc())
+ )
+ return result.scalars().all()
+
+ async def insert(
+ self,
+ dto: OperatorReleaseDto,
+ db: AsyncSession
+ ) -> None:
+ """插入发布版本"""
+ entity = OperatorRelease(
+ id=dto.id,
+ version=dto.version,
+ release_date=dto.release_date,
+ changelog=dto.changelog
+ )
+ db.add(entity)
+
+ async def update(
+ self,
+ dto: OperatorReleaseDto,
+ db: AsyncSession
+ ) -> None:
+ """更新发布版本"""
+ result = await db.execute(
+ select(OperatorRelease)
+ .where(
+ and_(
+ OperatorRelease.id == dto.id,
+ OperatorRelease.version == dto.version
+ )
+ )
+ )
+ entity = result.scalar_one_or_none()
+ if entity:
+ entity.changelog = dto.changelog
+ entity.release_date = dto.release_date
+
+ async def delete(self, operator_id: str, db: AsyncSession) -> None:
+ """删除算子的所有发布版本"""
+ await db.execute(
+ delete(OperatorRelease)
+ .where(OperatorRelease.id == operator_id)
+ )
diff --git a/runtime/datamate-python/app/module/operator/repository/operator_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_repository.py
new file mode 100644
index 00000000..990f7eb3
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/repository/operator_repository.py
@@ -0,0 +1,121 @@
+"""
+Operator Repository
+算子数据访问层
+"""
+import json
+from typing import List, Optional
+from datetime import datetime, timezone
+
+from sqlalchemy import select, text, update
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models.operator import Operator
+from app.module.operator.schema import OperatorDto
+
+
+class OperatorRepository:
+ """算子数据访问层"""
+
+ def __init__(self, model: Operator):
+ self.model = model
+
+ async def find_all(self, db: AsyncSession) -> List[Operator]:
+ """查询所有算子"""
+ result = await db.execute(select(Operator))
+ return result.scalars().all()
+
+ async def insert(self, dto: OperatorDto, db: AsyncSession) -> None:
+ """插入算子"""
+ entity = Operator(
+ id=dto.id,
+ name=dto.name,
+ description=dto.description,
+ version=dto.version,
+ inputs=dto.inputs,
+ outputs=dto.outputs,
+ runtime=dto.runtime,
+ settings=dto.settings,
+ file_name=dto.file_name,
+ file_size=dto.file_size,
+ metrics=dto.metrics,
+ usage_count=dto.usage_count or 0,
+ is_star=dto.is_star or False,
+ )
+ db.add(entity)
+
+ async def update(self, dto: OperatorDto, db: AsyncSession) -> None:
+ """更新算子"""
+ await db.execute(
+ update(Operator)
+ .where(Operator.id == dto.id)
+ .values(
+ name=dto.name,
+ description=dto.description,
+ version=dto.version,
+ inputs=dto.inputs,
+ outputs=dto.outputs,
+ runtime=dto.runtime,
+ settings=dto.settings,
+ file_name=dto.file_name,
+ file_size=dto.file_size,
+ metrics=dto.metrics,
+ is_star=dto.is_star,
+                updated_at=datetime.now(timezone.utc),
+ )
+ )
+
+ async def delete(self, operator_id: str, db: AsyncSession) -> None:
+ """删除算子"""
+ entity = await db.get(Operator, operator_id)
+ if entity:
+ await db.delete(entity)
+
+ async def count_by_star(self, is_star: bool, db: AsyncSession) -> int:
+ """统计收藏算子数量"""
+ result = await db.execute(
+ select(text("COUNT(*)"))
+ .select_from(Operator)
+ .where(Operator.is_star == is_star)
+ )
+ return result.scalar() or 0
+
+ async def operator_in_template(self, operator_id: str, db: AsyncSession) -> bool:
+ """检查算子是否在模板中"""
+ result = await db.execute(
+ text("""
+ SELECT COUNT(*) FROM t_operator_instance oi
+ JOIN t_clean_template t ON oi.instance_id = t.id
+ WHERE oi.operator_id = :operator_id
+ """),
+ {"operator_id": operator_id}
+ )
+ return (result.scalar() or 0) > 0
+
+ async def operator_in_unstop_task(self, operator_id: str, db: AsyncSession) -> bool:
+ """检查算子是否在未完成的任务中"""
+ result = await db.execute(
+ text("""
+ SELECT COUNT(*) FROM t_operator_instance oi
+ JOIN t_clean_task t ON oi.instance_id = t.id
+ WHERE oi.operator_id = :operator_id AND t.status != 'COMPLETED'
+ """),
+ {"operator_id": operator_id}
+ )
+ return (result.scalar() or 0) > 0
+
+ async def increment_usage_count(
+ self,
+ operator_ids: List[str],
+ db: AsyncSession
+ ) -> None:
+ """增加算子使用次数"""
+ if not operator_ids:
+ return
+ await db.execute(
+ update(Operator)
+ .where(Operator.id.in_(operator_ids))
+ .values(
+ usage_count=Operator.usage_count + 1,
+ updated_at=datetime.now(timezone.utc),
+ )
+ )
diff --git a/runtime/datamate-python/app/module/operator/schema/__init__.py b/runtime/datamate-python/app/module/operator/schema/__init__.py
new file mode 100644
index 00000000..a084cbaf
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/schema/__init__.py
@@ -0,0 +1,29 @@
+"""
+Operator Market Schemas
+算子市场 Schema 定义
+"""
+from .operator import (
+ OperatorDto,
+ OperatorListRequest,
+ PreUploadResponse,
+ OperatorUpdateDto,
+)
+from .category import (
+ CategoryDto,
+ CategoryTreeResponse,
+ CategoryTreePagedResponse,
+ CategoryRelationDto,
+)
+from .release import OperatorReleaseDto
+
+__all__ = [
+ "OperatorDto",
+ "OperatorListRequest",
+ "PreUploadResponse",
+ "CategoryDto",
+ "CategoryTreeResponse",
+ "CategoryTreePagedResponse",
+ "CategoryRelationDto",
+ "OperatorReleaseDto",
+ "OperatorUpdateDto",
+]
diff --git a/runtime/datamate-python/app/module/operator/schema/category.py b/runtime/datamate-python/app/module/operator/schema/category.py
new file mode 100644
index 00000000..9de9dc59
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/schema/category.py
@@ -0,0 +1,44 @@
+"""
+Category Schemas
+分类 Schema 定义
+"""
+from typing import List, Optional
+from datetime import datetime
+from pydantic import BaseModel, Field
+
+from app.module.shared.schema import BaseResponseModel, PaginatedData
+
+
+class CategoryDto(BaseResponseModel):
+ """分类 DTO"""
+ id: str = Field(..., description="分类ID")
+ name: str = Field(..., description="分类名称")
+ value: Optional[str] = Field(None, description="分类值")
+ type: Optional[str] = Field(None, description="分类类型")
+ parent_id: Optional[str] = Field(None, description="父分类ID")
+ count: Optional[int] = Field(0, description="算子数量")
+ created_at: Optional[datetime] = Field(None, description="创建时间")
+
+
+class CategoryTreeResponse(BaseResponseModel):
+ """分类树响应"""
+ id: str = Field(..., description="分类ID")
+ name: str = Field(..., description="分类名称")
+ count: int = Field(0, description="算子总数")
+ categories: List[CategoryDto] = Field(default_factory=list, description="子分类列表")
+
+
+class CategoryTreePagedResponse(BaseResponseModel):
+ """分类树分页响应"""
+ star_count: int = Field(0, description="收藏的算子数量")
+ categories: List[CategoryTreeResponse] = Field(default_factory=list, description="分类树列表")
+
+
+class PaginatedCategoryTree(PaginatedData):
+ star_count: int = Field(0, description="收藏的算子数量")
+
+
+class CategoryRelationDto(BaseResponseModel):
+ """分类关系 DTO"""
+ category_id: str = Field(..., description="分类ID")
+ operator_id: str = Field(..., description="算子ID")
diff --git a/runtime/datamate-python/app/module/operator/schema/operator.py b/runtime/datamate-python/app/module/operator/schema/operator.py
new file mode 100644
index 00000000..f0868542
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/schema/operator.py
@@ -0,0 +1,72 @@
+"""
+Operator Schemas
+算子 Schema 定义
+"""
+from __future__ import annotations
+
+from typing import List, Optional, Dict, Any
+from datetime import datetime
+from pydantic import BaseModel, Field
+
+from app.module.shared.schema import BaseResponseModel
+from .release import OperatorReleaseDto
+
+
+class OperatorDto(BaseResponseModel):
+ """算子 DTO"""
+ id: str = Field(..., description="算子ID")
+ name: str = Field(..., description="算子名称")
+ description: Optional[str] = Field(None, description="算子描述")
+ version: str = Field(..., description="算子版本")
+ inputs: Optional[str] = Field(None, description="输入定义(JSON)")
+ outputs: Optional[str] = Field(None, description="输出定义(JSON)")
+ runtime: Optional[str] = Field(None, description="运行时配置(JSON)")
+ settings: Optional[str] = Field(None, description="算子设置(JSON)")
+ file_name: Optional[str] = Field(None, description="文件名")
+ file_size: Optional[int] = Field(None, description="文件大小(字节)")
+ metrics: Optional[str] = Field(None, description="算子指标(JSON)")
+ usage_count: Optional[int] = Field(None, description="使用次数")
+ is_star: Optional[bool] = Field(None, description="是否收藏")
+ categories: Optional[List[str]] = Field(None, description="分类ID列表")
+ overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值")
+ requirements: Optional[List[str]] = Field(None, description="Python 依赖列表")
+ readme: Optional[str] = Field(None, description="README 内容")
+ releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表")
+ created_at: Optional[datetime] = Field(None, description="创建时间")
+ updated_at: Optional[datetime] = Field(None, description="更新时间")
+
+
+class OperatorListRequest(BaseResponseModel):
+ """算子列表查询请求"""
+    page: int = Field(0, ge=0, description="页码(从0开始)")
+ size: int = Field(10, ge=1, description="页大小")
+ categories: List[List[str]] = Field(default_factory=list, description="分类ID列表(每个父分类下的id放到一个列表中)")
+ keyword: Optional[str] = Field(None, description="搜索关键词")
+ label_name: Optional[str] = Field(None, description="标签名称(暂不支持)")
+ is_star: Optional[bool] = Field(None, description="是否收藏")
+
+
+class PreUploadResponse(BaseResponseModel):
+ """预上传响应"""
+ req_id: str = Field(..., description="请求ID")
+
+
+class OperatorUpdateDto(BaseResponseModel):
+ """算子更新 DTO(所有字段可选)"""
+ name: Optional[str] = Field(None, description="算子名称")
+ description: Optional[str] = Field(None, description="算子描述")
+ version: Optional[str] = Field(None, description="算子版本")
+ inputs: Optional[str] = Field(None, description="输入定义(JSON)")
+ outputs: Optional[str] = Field(None, description="输出定义(JSON)")
+ runtime: Optional[str] = Field(None, description="运行时配置(JSON)")
+ settings: Optional[str] = Field(None, description="算子设置(JSON)")
+ file_name: Optional[str] = Field(None, description="文件名")
+ file_size: Optional[int] = Field(None, description="文件大小(字节)")
+ metrics: Optional[str] = Field(None, description="算子指标(JSON)")
+ usage_count: Optional[int] = Field(None, description="使用次数")
+ is_star: Optional[bool] = Field(None, description="是否收藏")
+ categories: Optional[List[str]] = Field(None, description="分类ID列表")
+ overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值")
+ requirements: Optional[List[str]] = Field(None, description="Python 依赖列表")
+ readme: Optional[str] = Field(None, description="README 内容")
+ releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表")
diff --git a/runtime/datamate-python/app/module/operator/schema/release.py b/runtime/datamate-python/app/module/operator/schema/release.py
new file mode 100644
index 00000000..f91297ee
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/schema/release.py
@@ -0,0 +1,22 @@
+"""
+Operator Release Schemas
+算子发布版本 Schema 定义
+"""
+from __future__ import annotations
+
+from typing import List, Optional
+from datetime import datetime
+from pydantic import BaseModel, Field
+
+from app.module.shared.schema import BaseResponseModel
+
+
+class OperatorReleaseDto(BaseResponseModel):
+ """算子发布版本 DTO"""
+ id: str = Field(..., description="算子ID")
+ version: str = Field(..., description="版本号")
+ release_date: Optional[datetime] = Field(None, description="发布时间")
+ changelog: Optional[List[str]] = Field(None, description="更新日志列表")
+
+
+__all__ = ["OperatorReleaseDto"]
diff --git a/runtime/datamate-python/app/module/operator/service/__init__.py b/runtime/datamate-python/app/module/operator/service/__init__.py
new file mode 100644
index 00000000..3e1c1d0c
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/service/__init__.py
@@ -0,0 +1,11 @@
+"""
+Operator Market Services
+算子市场服务层
+"""
+from .operator_service import OperatorService
+from .category_service import CategoryService
+
+__all__ = [
+ "OperatorService",
+ "CategoryService",
+]
diff --git a/runtime/datamate-python/app/module/operator/service/category_service.py b/runtime/datamate-python/app/module/operator/service/category_service.py
new file mode 100644
index 00000000..c84a4906
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/service/category_service.py
@@ -0,0 +1,101 @@
+"""
+Category Service
+分类服务层
+"""
+from datetime import datetime
+from typing import List
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.module.operator.repository import (
+ CategoryRepository,
+ CategoryRelationRepository,
+)
+from app.module.operator.schema import (
+ CategoryDto,
+ CategoryTreeResponse,
+ CategoryTreePagedResponse,
+)
+from app.db.models.operator import Operator
+from app.module.operator.repository.operator_repository import OperatorRepository
+
+
+class CategoryService:
+ """分类服务"""
+
+ def __init__(
+ self,
+ category_repo: CategoryRepository,
+ category_relation_repo: CategoryRelationRepository,
+ operator_repo: OperatorRepository,
+ ):
+ self.category_repo = category_repo
+ self.category_relation_repo = category_relation_repo
+ self.operator_repo = operator_repo
+
+ async def get_all_categories(
+ self,
+ db: AsyncSession
+ ) -> CategoryTreePagedResponse:
+ """获取所有分类(树状结构)"""
+ # Get all categories
+ all_categories = await self.category_repo.find_all(db)
+ category_map = {c.id: c for c in all_categories}
+
+ # Get all relations and count operators per category
+ all_relations = await self.category_relation_repo.find_all(db)
+ relation_map = {}
+ for rel in all_relations:
+ if rel.category_id not in relation_map:
+ relation_map[rel.category_id] = 0
+ relation_map[rel.category_id] += 1
+
+ # Group by parent_id
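+        # Top-level categories (parent_id == "0") act as tree roots; only their children are grouped here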
+ grouped_by_parent = {}
+ for cat in all_categories:
+ if cat.parent_id != "0":
+ if cat.parent_id not in grouped_by_parent:
+ grouped_by_parent[cat.parent_id] = []
+ grouped_by_parent[cat.parent_id].append(cat)
+
+ # Build category trees
+ parent_ids = sorted(
+ grouped_by_parent.keys(),
+ key=lambda pid: pid
+ )
+
+ category_trees = []
+ for parent_id in parent_ids:
+ group = grouped_by_parent[parent_id]
+ parent_category = category_map[parent_id]
+
+ # Build DTOs for children
+ child_dtos = []
+ total_count = 0
+            for cat in sorted(group, key=lambda c: c.created_at or datetime.min):
+ cat_dto = CategoryDto(
+ id=cat.id,
+ name=cat.name,
+ value=cat.value,
+ type=cat.type,
+ parent_id=cat.parent_id,
+ count=relation_map.get(cat.id, 0),
+ created_at=cat.created_at,
+ )
+ child_dtos.append(cat_dto)
+ total_count += cat_dto.count
+
+ tree = CategoryTreeResponse(
+ id=parent_id,
+ name=parent_category.name,
+ count=total_count,
+ categories=child_dtos,
+ )
+ category_trees.append(tree)
+
+ # Get star count
+ star_count = await self.operator_repo.count_by_star(True, db)
+
+ return CategoryTreePagedResponse(
+ star_count=star_count,
+ categories=category_trees,
+ )
diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py
new file mode 100644
index 00000000..6314f221
--- /dev/null
+++ b/runtime/datamate-python/app/module/operator/service/operator_service.py
@@ -0,0 +1,624 @@
+"""
+Operator Service
+算子服务层
+"""
+import json
+import os
+import uuid
+import shutil
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any, TYPE_CHECKING
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, text, func
+
+from app.core.logging import get_logger
+from app.core.exception import BusinessError, ErrorCodes
+from app.module.operator.repository import (
+ OperatorRepository,
+ CategoryRelationRepository,
+ OperatorReleaseRepository,
+)
+from app.module.operator.schema import (
+ OperatorDto,
+ OperatorUpdateDto,
+ OperatorReleaseDto,
+)
+from app.module.operator.parsers import ParserHolder
+from app.module.operator.constants import (
+ OPERATOR_BASE_PATH,
+ UPLOAD_DIR,
+ EXTRACT_DIR,
+ YAML_PATH,
+ SERVICE_ID,
+)
+from app.module.shared.file_service import FileService
+from app.module.shared.file_models import (
+ ChunkUploadRequestDto,
+ FileUploadResult,
+)
+
+logger = get_logger(__name__)
+
+
+class OperatorService:
+ """算子服务"""
+
+ def __init__(
+ self,
+ operator_repo: OperatorRepository,
+ category_relation_repo: CategoryRelationRepository,
+ operator_release_repo: OperatorReleaseRepository,
+ parser_holder: ParserHolder,
+ file_service: FileService,
+ ):
+ self.operator_repo = operator_repo
+ self.category_relation_repo = category_relation_repo
+ self.operator_release_repo = operator_release_repo
+ self.parser_holder = parser_holder
+ self.file_service = file_service
+
+ async def get_operators(
+ self,
+ page: int,
+ size: int,
+ categories: List[List[str]],
+ keyword: Optional[str],
+ is_star: Optional[bool],
+ db: AsyncSession
+ ) -> List[OperatorDto]:
+ """查询算子列表(分页)"""
+ offset = page * size
+
+ # Build query with categories filter
+ conditions = []
+ params = {"limit": size, "offset": offset}
+
+ if is_star is not None:
+ conditions.append("ov.is_star = :is_star")
+ params["is_star"] = is_star
+
+ if keyword:
+ conditions.append(
+ "(ov.operator_name ILIKE :keyword OR ov.description ILIKE :keyword)"
+ )
+ params["keyword"] = f"%{keyword}%"
+
+ where_clause = ""
+ if conditions:
+ where_clause = "WHERE " + " AND ".join(conditions)
+
+ # Handle categories grouping
+ group_by = "GROUP BY ov.operator_id, ov.operator_name, ov.description, ov.version, " \
+ "ov.inputs, ov.outputs, ov.runtime, ov.settings, ov.is_star, " \
+ "ov.file_size, ov.usage_count, ov.created_at, ov.updated_at, ov.created_by, ov.updated_by"
+
+ having_clause = ""
+ if categories:
+ # Flatten all category IDs for IN clause
+ all_category_ids = [cat_id for sublist in categories for cat_id in sublist]
+ if all_category_ids:
+ where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)"
+ params["category_ids"] = all_category_ids
+
+ # Build HAVING clause for category groups
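+            # AND across groups, OR within a group: an operator must match at least one category from every inner list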
+ having_clauses = []
+ for i, cat_group in enumerate(categories):
+ cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group])
+ having_clauses.append(
+ f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0"
+ )
+ having_clause = "HAVING " + " AND ".join(having_clauses)
+
+ query = f"""
+ SELECT
+ ov.operator_id AS id,
+ ov.operator_name AS name,
+ ov.description,
+ ov.version,
+ ov.inputs,
+ ov.outputs,
+ ov.runtime,
+ ov.settings,
+ ov.is_star,
+ ov.file_size,
+ ov.usage_count,
+ ov.created_at,
+ ov.updated_at,
+ string_agg(ov.category_id, ',' ORDER BY ov.created_at DESC) AS categories
+ FROM v_operator ov
+ {where_clause}
+ {group_by}
+ {having_clause}
+ ORDER BY ov.created_at DESC
+ LIMIT :limit OFFSET :offset
+ """
+
+ result = await db.execute(text(query), params)
+ rows = result.fetchall()
+
+ # Convert to DTOs
+ operators = []
+ for row in rows:
+ categories_list = []
+ if row.categories:
+ categories_list = [cat_id for cat_id in row.categories.split(',') if cat_id]
+
+ operators.append(OperatorDto(
+ id=row.id,
+ name=row.name,
+ description=row.description,
+ version=row.version,
+ inputs=row.inputs,
+ outputs=row.outputs,
+ runtime=row.runtime,
+ settings=row.settings,
+ file_name=None,
+ file_size=row.file_size,
+ metrics=None,
+ usage_count=row.usage_count,
+ is_star=row.is_star,
+ categories=categories_list,
+ created_at=row.created_at,
+ updated_at=row.updated_at,
+ ))
+
+ return operators
+
+ async def count_operators(
+ self,
+ categories: List[List[str]],
+ keyword: Optional[str],
+ is_star: Optional[bool],
+ db: AsyncSession
+ ) -> int:
+ """统计算子数量"""
+ conditions = []
+ params = {}
+
+ if is_star is not None:
+ conditions.append("is_star = :is_star")
+ params["is_star"] = is_star
+
+ if keyword:
+ conditions.append(
+ "(operator_name ILIKE :keyword OR description ILIKE :keyword)"
+ )
+ params["keyword"] = f"%{keyword}%"
+
+ where_clause = ""
+ if conditions:
+ where_clause = "WHERE " + " AND ".join(conditions)
+
+ # Handle categories grouping
+ group_by = "GROUP BY operator_id, operator_name, description, version, inputs, outputs, " \
+ "runtime, settings, is_star, file_size, usage_count, created_at, updated_at, " \
+ "created_by, updated_by"
+
+ having_clause = ""
+ if categories:
+ # Flatten all category IDs for IN clause
+ all_category_ids = [cat_id for sublist in categories for cat_id in sublist]
+ if all_category_ids:
+ where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)"
+ params["category_ids"] = all_category_ids
+
+ # Build HAVING clause for category groups
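+            # Same group semantics as get_operators: at least one category from every inner list must match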
+ having_clauses = []
+ for i, cat_group in enumerate(categories):
+ cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group])
+ having_clauses.append(
+ f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0"
+ )
+ having_clause = "HAVING " + " AND ".join(having_clauses)
+
+ query = f"""
+ SELECT COUNT(*) as count
+ FROM (
+ SELECT operator_id
+ FROM v_operator
+ {where_clause}
+ {group_by}
+ {having_clause}
+ ) AS t
+ """
+
+ result = await db.execute(text(query), params)
+ return result.scalar() or 0
+
+ async def get_operator_by_id(
+ self,
+ operator_id: str,
+ db: AsyncSession
+ ) -> OperatorDto:
+ """根据 ID 获取算子详情"""
+ result = await db.execute(
+ text("""
+ SELECT
+ operator_id, operator_name, description, version, inputs, outputs, runtime,
+ settings, is_star, file_name, file_size, usage_count, metrics,
+ created_at, updated_at, created_by, updated_by,
+ string_agg(category_name, ',' ORDER BY created_at DESC) AS categories
+ FROM v_operator
+ WHERE operator_id = :operator_id
+ GROUP BY operator_id, operator_name, description, version, inputs, outputs, runtime,
+ settings, is_star, file_name, file_size, usage_count, metrics,
+ created_at, updated_at, created_by, updated_by
+ """),
+ {"operator_id": operator_id}
+ )
+ row = result.fetchone()
+
+ if not row:
+ raise BusinessError(ErrorCodes.OPERATOR_NOT_FOUND, operator_id)
+
+ # Parse categories from comma-separated string
+ categories_str = row.categories if hasattr(row, 'categories') and row.categories else ""
+ categories = [c.strip() for c in categories_str.split(",")] if categories_str else []
+
+ # Build DTO
+ operator = OperatorDto(
+ id=row.operator_id,
+ name=row.operator_name,
+ description=row.description,
+ version=row.version,
+ inputs=row.inputs,
+ outputs=row.outputs,
+ runtime=row.runtime,
+ settings=row.settings,
+ file_name=row.file_name,
+ file_size=row.file_size,
+ metrics=row.metrics,
+ usage_count=row.usage_count,
+ is_star=row.is_star,
+ created_at=row.created_at,
+ updated_at=row.updated_at,
+ categories=categories,
+ )
+
+ # Read requirements and readme if file exists
+ if row.file_name:
+ extract_path = self._get_extract_path(
+ self._get_stem(row.file_name)
+ )
+ operator.requirements = self._read_requirements(extract_path)
+ operator.readme = self._get_readme_content(extract_path)
+
+ # Load releases
+ releases = await self.operator_release_repo.find_all_by_operator_id(
+ operator_id, db
+ )
+ operator.releases = [
+ OperatorReleaseDto(
+ id=release.id,
+ version=release.version,
+ release_date=release.release_date,
+ changelog=release.changelog
+ )
+ for release in releases
+ ]
+
+ return operator
+
+ async def create_operator(
+ self,
+ req: OperatorDto,
+ db: AsyncSession
+ ) -> OperatorDto:
+ """创建算子"""
+
+ # Generate ID if not provided
+ if not req.id:
+ req.id = str(uuid.uuid4())
+
+ # Override settings
+ self._override_settings(req)
+
+ # Insert operator
+ await self.operator_repo.insert(req, db)
+ await db.flush()
+
+ # Insert category relations
+ if req.categories:
+ await self.category_relation_repo.batch_insert(
+ req.id, req.categories, db
+ )
+
+ # Insert release
+ if req.releases:
+ release = req.releases[0]
+ release.id = req.id
+ release.version = req.version
+ release.release_date = datetime.now()
+ await self.operator_release_repo.insert(release, db)
+
+ # Extract files
+ if req.file_name:
+ self.parser_holder.extract_to(
+ self._get_file_type(req.file_name),
+ self._get_upload_path(req.file_name),
+ self._get_extract_path(self._get_stem(req.file_name))
+ )
+
+ return req
+
+ async def update_operator(
+ self,
+ operator_id: str,
+ req: OperatorUpdateDto,
+ db: AsyncSession
+ ) -> OperatorDto:
+ """更新算子"""
+
+ # Get existing operator
+ existing = await self.get_operator_by_id(operator_id, db)
+
+ # Save original version for release comparison
+ original_version = existing.version
+
+ # Merge update request into existing operator
+ # Only update fields that are provided (not None)
+ if req.name is not None:
+ existing.name = req.name
+ if req.description is not None:
+ existing.description = req.description
+ if req.version is not None:
+ existing.version = req.version
+ if req.inputs is not None:
+ existing.inputs = req.inputs
+ if req.outputs is not None:
+ existing.outputs = req.outputs
+ if req.runtime is not None:
+ existing.runtime = req.runtime
+ if req.settings is not None:
+ existing.settings = req.settings
+ if req.file_name is not None:
+ existing.file_name = req.file_name
+ if req.file_size is not None:
+ existing.file_size = req.file_size
+ if req.metrics is not None:
+ existing.metrics = req.metrics
+ if req.usage_count is not None:
+ existing.usage_count = req.usage_count
+ if req.is_star is not None:
+ existing.is_star = req.is_star
+ if req.categories is not None:
+ existing.categories = req.categories
+ if req.overrides is not None:
+ existing.overrides = req.overrides
+
+ # Override settings
+ self._override_settings(existing)
+
+ # Update operator
+ await self.operator_repo.update(existing, db)
+
+ # Update category relations
+ if req.file_name is not None and req.categories is not None:
+ await self.category_relation_repo.batch_update(
+ operator_id, req.categories, db
+ )
+
+ # Update release
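+        # Reuse the existing release row when the version is unchanged; otherwise insert a new release entry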
+ if req.releases is not None and len(req.releases) > 0:
+ release = req.releases[0]
+ release.id = operator_id
+            release.version = existing.version
+ release.release_date = datetime.now()
+ if original_version == release.version:
+ await self.operator_release_repo.update(release, db)
+ else:
+ await self.operator_release_repo.insert(release, db)
+
+ # Extract files
+ if req.file_name is not None:
+ self.parser_holder.extract_to(
+ self._get_file_type(req.file_name),
+ self._get_upload_path(req.file_name),
+ self._get_extract_path(self._get_stem(req.file_name))
+ )
+
+ await db.flush()
+ return await self.get_operator_by_id(operator_id, db)
+
+ async def delete_operator(
+ self,
+ operator_id: str,
+ db: AsyncSession
+ ) -> None:
+ """删除算子"""
+ # Check if operator is in use
+ in_template = await self.operator_repo.operator_in_template(operator_id, db)
+ in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db)
+ if in_template or in_unstop_task:
+ raise BusinessError(ErrorCodes.OPERATOR_IN_INSTANCE)
+
+ # Check if operator is predefined
+ is_predefined = await self.category_relation_repo.operator_is_predefined(
+ operator_id, db
+ )
+ if is_predefined:
+ raise BusinessError(ErrorCodes.OPERATOR_CANNOT_DELETE_PREDEFINED)
+
+ # Get operator for file cleanup
+ operator = await self.get_operator_by_id(operator_id, db)
+
+ # Delete from database
+ await self.operator_repo.delete(operator_id, db)
+ await self.category_relation_repo.delete_by_operator_id(operator_id, db)
+ await self.operator_release_repo.delete(operator_id, db)
+
+ # Delete extracted files
+ if operator.file_name:
+ extract_path = self._get_extract_path(self._get_stem(operator.file_name))
+ shutil.rmtree(extract_path, ignore_errors=True)
+
+ async def upload_operator(
+ self,
+ file_name: str,
+ db: AsyncSession
+ ) -> OperatorDto:
+ """上传算子文件并解析元数据"""
+ file_path = self._get_upload_path(file_name)
+ file_size = os.path.getsize(file_path) if os.path.exists(file_path) else None
+ return self.parser_holder.parse_yaml_from_archive(
+ self._get_file_type(file_name),
+ file_path,
+ YAML_PATH,
+ file_name,
+ file_size
+ )
+
+ async def pre_upload(self, db: AsyncSession) -> str:
+ """预上传,返回请求 ID"""
+ from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR
+
+ upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR)
+ req_id = await self.file_service.pre_upload(
+ upload_path=upload_path,
+ service_id=SERVICE_ID,
+ db_session=db,
+ check_info=None
+ )
+ return req_id
+
+ async def chunk_upload(
+ self,
+ req_id: str,
+ file_no: int,
+ file_name: str,
+ total_chunk_num: int,
+ chunk_no: int,
+ check_sum_hex: Optional[str],
+ file_content: bytes,
+ db: AsyncSession
+ ) -> FileUploadResult:
+ """分块上传文件"""
+ from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR
+
+ upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR)
+
+ chunk_request = ChunkUploadRequestDto(
+ req_id=req_id,
+ file_no=file_no,
+ file_name=file_name,
+ total_chunk_num=total_chunk_num,
+ chunk_no=chunk_no,
+ check_sum_hex=check_sum_hex,
+ )
+
+ return await self.file_service.chunk_upload(
+ chunk_request, upload_path, file_content, db
+ )
+
+ def download_example_operator(self, file_path: str) -> Path:
+ """下载示例算子文件"""
+ path = Path(file_path)
+ if not path.exists():
+ raise FileNotFoundError(f"File not found: {file_path}")
+ return path
+
+ def _override_settings(self, operator: OperatorDto) -> None:
+ """用 overrides 值覆盖 settings 的 defaultVal"""
+ if not operator.settings or not operator.overrides:
+ return
+
+ try:
+ settings = json.loads(operator.settings)
+ for key, value in operator.overrides.items():
+ if key not in settings:
+ continue
+
+ setting = settings[key]
+ setting_type = setting.get("type")
+
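+                # Scalar controls take the override directly; checkbox values are joined into a comma-separated string; range overrides map onto each property's defaultVal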
+ match setting_type:
+ case "slider" | "switch" | "select" | "input" | "radio":
+ setting["defaultVal"] = value
+ case "checkbox":
+ setting["defaultVal"] = self._convert_to_list_string(value)
+ case "range":
+ self._update_properties(setting, value)
+
+ settings[key] = setting
+
+ operator.settings = json.dumps(settings)
+ except json.JSONDecodeError as e:
+ raise BusinessError(ErrorCodes.OPERATOR_PARSE_FAILED, str(e))
+
+ def _convert_to_list_string(self, value: Any) -> str:
+ """转换为逗号分隔的字符串"""
+ if value is None:
+ return ""
+ if isinstance(value, list):
+ return ",".join(str(v) for v in value)
+ return str(value)
+
+ def _update_properties(self, setting: Dict[str, Any], value: Any) -> None:
+ """更新 range 类型的 properties"""
+ if not isinstance(value, list):
+ return
+
+ properties = setting.get("properties", [])
+ if not isinstance(properties, list) or len(properties) != len(value):
+ return
+
+ for i, prop in enumerate(properties):
+ if isinstance(prop, dict):
+ prop["defaultVal"] = value[i]
+
+ setting["properties"] = properties
+
+ def _read_requirements(self, extract_path: str) -> List[str]:
+ """读取 requirements.txt"""
+ requirements_path = Path(extract_path) / "requirements.txt"
+ if not requirements_path.exists():
+ return []
+
+ requirements = []
+ try:
+ with open(requirements_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ if line and not line.startswith('#'):
+ requirements.append(line)
+ except Exception as e:
+ logger.warning(f"Failed to read requirements: {e}")
+ return requirements
+
+ def _get_readme_content(self, extract_path: str) -> str:
+ """读取 README 内容"""
+ dir_path = Path(extract_path)
+ if not dir_path.exists() or not dir_path.is_dir():
+ logger.info(f"Directory does not exist or is not a directory: {extract_path}")
+ return ""
+
+ candidates = ["README.md", "readme.md", "Readme.md"]
+ for filename in candidates:
+ readme_path = dir_path / filename
+ if readme_path.exists() and readme_path.is_file():
+ try:
+ content = readme_path.read_text(encoding='utf-8')
+ logger.info(f"Successfully read README from: {readme_path}")
+ return content
+ except Exception as e:
+ logger.warning(f"Failed to read README from {readme_path}: {e}")
+ logger.info(f"No README found in: {extract_path}")
+ return ""
+
+ def _get_file_type(self, file_name: str) -> str:
+ """获取文件类型(扩展名)"""
+ return file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else ""
+
+ def _get_stem(self, file_name: str) -> str:
+ """获取文件名不含扩展名"""
+ return file_name.rsplit('.', 1)[0] if '.' in file_name else file_name
+
+ def _get_upload_path(self, file_name: str) -> str:
+ """获取上传文件路径"""
+ return os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR, file_name)
+
+ def _get_extract_path(self, file_stem: str) -> str:
+ """获取解压路径"""
+ return os.path.join(OPERATOR_BASE_PATH, EXTRACT_DIR, file_stem)
diff --git a/runtime/datamate-python/app/module/shared/__init__.py b/runtime/datamate-python/app/module/shared/__init__.py
index e69de29b..fd0d7a1a 100644
--- a/runtime/datamate-python/app/module/shared/__init__.py
+++ b/runtime/datamate-python/app/module/shared/__init__.py
@@ -0,0 +1,21 @@
+"""
+Shared Module Init
+共享模块初始化
+"""
+from .file_service import FileService
+from .file_models import (
+ ChunkUploadPreRequestDto,
+ ChunkUploadRequestDto,
+ FileUploadResult,
+)
+from .chunks_saver import ChunksSaver
+from .chunk_upload_repository import ChunkUploadRepository
+
+__all__ = [
+ "FileService",
+ "ChunkUploadPreRequestDto",
+ "ChunkUploadRequestDto",
+ "FileUploadResult",
+ "ChunksSaver",
+ "ChunkUploadRepository",
+]
diff --git a/runtime/datamate-python/app/module/shared/chunk_upload_repository.py b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py
new file mode 100644
index 00000000..8a0c717d
--- /dev/null
+++ b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py
@@ -0,0 +1,95 @@
+"""
+Chunk Upload Repository
+分片上传数据访问层
+"""
+from typing import Optional, List
+
+from sqlalchemy import select, update, delete
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models.chunk_upload import ChunkUploadPreRequest
+from app.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ChunkUploadRepository:
+ """分片上传数据访问层"""
+
+ async def find_by_id(
+ self,
+ req_id: str,
+ db: AsyncSession
+ ) -> Optional[ChunkUploadPreRequest]:
+ """根据ID查询"""
+ result = await db.execute(
+ select(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id)
+ )
+ return result.scalar_one_or_none()
+
+ async def find_by_service_id(
+ self,
+ service_id: str,
+ db: AsyncSession
+ ) -> List[ChunkUploadPreRequest]:
+ """根据服务ID查询"""
+ result = await db.execute(
+ select(ChunkUploadPreRequest).where(
+ ChunkUploadPreRequest.service_id == service_id
+ )
+ )
+ return result.scalars().all()
+
+ async def find_all(self, db: AsyncSession) -> List[ChunkUploadPreRequest]:
+ """查询所有"""
+ result = await db.execute(select(ChunkUploadPreRequest))
+ return result.scalars().all()
+
+ async def insert(
+ self,
+ request: ChunkUploadPreRequest,
+ db: AsyncSession
+ ) -> None:
+ """插入"""
+ db.add(request)
+
+ async def update(
+ self,
+ request: ChunkUploadPreRequest,
+ db: AsyncSession
+ ) -> int:
+ """更新"""
+ from datetime import datetime, timezone
+ result = await db.execute(
+ update(ChunkUploadPreRequest)
+ .where(ChunkUploadPreRequest.id == request.id)
+ .values(
+ uploaded_file_num=request.uploaded_file_num,
+ timeout=request.timeout,
+ )
+ )
+ return result.rowcount
+
+ async def delete_by_id(
+ self,
+ req_id: str,
+ db: AsyncSession
+ ) -> int:
+ """根据ID删除"""
+ result = await db.execute(
+ delete(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id)
+ )
+ return result.rowcount
+
+ async def delete_by_service_id(
+ self,
+ service_id: str,
+ db: AsyncSession
+ ) -> int:
+ """根据服务ID删除"""
+ result = await db.execute(
+ delete(ChunkUploadPreRequest).where(
+ ChunkUploadPreRequest.service_id == service_id
+ )
+ )
+ return result.rowcount
diff --git a/runtime/datamate-python/app/module/shared/chunks_saver.py b/runtime/datamate-python/app/module/shared/chunks_saver.py
new file mode 100644
index 00000000..554b263b
--- /dev/null
+++ b/runtime/datamate-python/app/module/shared/chunks_saver.py
@@ -0,0 +1,146 @@
+"""
+Chunks Saver
+分片保存器,用于处理文件分片上传
+"""
+import os
+from pathlib import Path
+from typing import Optional
+from datetime import datetime, timezone
+
+from fastapi import UploadFile
+
+from app.core.logging import get_logger
+from app.module.shared.file_models import ChunkUploadRequestDto
+
+logger = get_logger(__name__)
+
+
+class ChunksSaver:
+ """分片保存器"""
+
+ TEMP_DIR_NAME_FORMAT = "req_%s_chunks"
+
+ @staticmethod
+ def save(
+ file_upload_request: ChunkUploadRequestDto,
+ pre_upload_req_id: str,
+ upload_path: str,
+ file_content: bytes
+ ) -> Optional[Path]:
+ """
+ 保存分片
+
+ Args:
+ file_upload_request: 上传分片的请求
+ pre_upload_req_id: 预上传请求ID
+ upload_path: 上传基础路径
+ file_content: 文件内容(字节)
+
+ Returns:
+ 保存后的文件路径,如果不是最后一个分片则返回None
+ """
+ start_time = datetime.now(timezone.utc)
+
+ temp_dir = Path(upload_path) / (
+ ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_upload_req_id
+ )
+ temp_dir.mkdir(parents=True, exist_ok=True)
+
+ temp_file = temp_dir / str(file_upload_request.file_no)
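+        # Chunks of the same file are appended to one temp file as they arrive; the final chunk triggers a rename to the real file name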
+
+ ChunksSaver._append_to_target_file(temp_file, file_content)
+
+ if file_upload_request.total_chunk_num != file_upload_request.chunk_no:
+ elapsed = (datetime.now(timezone.utc) - start_time).total_seconds()
+ logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s")
+ return None
+
+ final_file = Path(upload_path) / file_upload_request.file_name
+
+ try:
+ temp_file.rename(final_file)
+ except OSError as e:
+ logger.error(
+ f"failed to mv file: {temp_file.name}, req id: {pre_upload_req_id}, error: {e}"
+ )
+ raise ValueError("failed to move file to target dir") from e
+
+ elapsed = (datetime.now(timezone.utc) - start_time).total_seconds()
+ logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s")
+
+ return final_file
+
+ @staticmethod
+ def save_file(
+ file_upload_request: ChunkUploadRequestDto,
+ upload_path: str,
+ file_content: bytes
+ ) -> Path:
+ """
+ 保存文件(不分片)
+
+ Args:
+ file_upload_request: 上传请求
+ upload_path: 上传路径
+ file_content: 文件内容(字节)
+
+ Returns:
+ 保存后的文件路径
+ """
+ target_file = Path(upload_path) / file_upload_request.file_name
+
+ logger.info(f"file path {target_file}, file size {len(file_content)}")
+
+ try:
+ target_file.parent.mkdir(parents=True, exist_ok=True)
+ target_file.write_bytes(file_content)
+ except OSError as e:
+ logger.error(f"failed to save file: {target_file}, error: {e}")
+ raise ValueError("failed to save file") from e
+
+ return target_file
+
+ @staticmethod
+ def delete_folder(folder_path: str) -> None:
+ """
+ 删除指定路径下的所有文件
+
+ Args:
+ folder_path: 文件夹路径
+ """
+ folder = Path(folder_path)
+
+ if not folder.exists():
+ logger.info(f"folder {folder_path} does not exist")
+ return
+
+ try:
+            for item in folder.glob("*"):
+                if item.is_file():
+                    item.unlink()
+                elif item.is_dir():
+                    # Recurse to empty the sub-directory, then remove it
+                    ChunksSaver.delete_folder(str(item))
+                    item.rmdir()
+ except OSError as e:
+ logger.error(f"failed to delete folder: {folder_path}, error: {e}")
+ raise ValueError("failed to delete folder") from e
+
+ @staticmethod
+ def _append_to_target_file(target_file: Path, content: bytes) -> None:
+ """
+ 追加内容到目标文件末尾
+
+ Args:
+ target_file: 目标文件
+ content: 要追加的内容
+ """
+ try:
+ with open(target_file, "ab") as f:
+ f.write(content)
+ except OSError as e:
+ logger.error(f"failed to append to file: {target_file}, error: {e}")
+ raise ValueError("failed to append content to file") from e
diff --git a/runtime/datamate-python/app/module/shared/file_models.py b/runtime/datamate-python/app/module/shared/file_models.py
new file mode 100644
index 00000000..c4e98775
--- /dev/null
+++ b/runtime/datamate-python/app/module/shared/file_models.py
@@ -0,0 +1,38 @@
+"""
+File Models
+文件相关模型定义
+"""
+from pathlib import Path
+from typing import Optional
+from pydantic import BaseModel, Field
+from datetime import datetime
+
+
+class ChunkUploadPreRequestDto(BaseModel):
+ """分片上传预请求DTO"""
+ id: str = Field(..., description="请求ID")
+ total_file_num: int = Field(..., description="总文件数", ge=1)
+ uploaded_file_num: Optional[int] = Field(None, description="已上传文件数", ge=0)
+ upload_path: str = Field(..., description="文件路径")
+ timeout: Optional[datetime] = Field(None, description="上传请求超时时间")
+ service_id: Optional[str] = Field(None, description="上传请求所属服务ID")
+ check_info: Optional[str] = Field(None, description="业务信息")
+
+
+class ChunkUploadRequestDto(BaseModel):
+ """分片上传请求DTO"""
+ req_id: str = Field(..., description="预上传返回的ID")
+ file_no: int = Field(1, description="文件编号", ge=1)
+ file_name: str = Field(..., description="文件名称")
+ total_chunk_num: int = Field(1, description="总分块数量", ge=1)
+ chunk_no: int = Field(1, description="当前分块编号", ge=1)
+ file_size: Optional[int] = Field(None, description="文件大小", ge=0)
+ check_sum_hex: Optional[str] = Field(None, description="文件校验和(十六进制字符串)")
+
+
+class FileUploadResult(BaseModel):
+ """文件上传结果"""
+ is_all_files_uploaded: bool = Field(..., description="是否所有文件已上传")
+ check_info: Optional[str] = Field(None, description="业务上传信息")
+ saved_file_path: Optional[str] = Field(None, description="保存的文件路径")
+ file_name: str = Field(..., description="文件名称")
diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py
new file mode 100644
index 00000000..1a858587
--- /dev/null
+++ b/runtime/datamate-python/app/module/shared/file_service.py
@@ -0,0 +1,183 @@
+"""
+File Service
+文件服务,处理文件上传、分片上传等功能
+"""
+import os
+import uuid
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Optional
+
+from app.core.logging import get_logger
+from app.db.models.chunk_upload import ChunkUploadPreRequest
+from app.module.shared.chunk_upload_repository import ChunkUploadRepository
+from app.module.shared.chunks_saver import ChunksSaver
+from app.module.shared.file_models import (
+ ChunkUploadRequestDto,
+ FileUploadResult,
+)
+
+logger = get_logger(__name__)
+
+
+class FileService:
+ """文件服务"""
+
+ DEFAULT_TIMEOUT_SECONDS = 120
+
+ def __init__(
+ self,
+ chunk_upload_repo: ChunkUploadRepository,
+ ):
+ self.chunk_upload_repo = chunk_upload_repo
+
+ async def pre_upload(
+ self,
+ upload_path: str,
+ service_id: str,
+ db_session,
+ check_info: Optional[str] = None
+ ) -> str:
+ """
+ 预上传
+
+ Args:
+ upload_path: 上传路径
+ service_id: 服务ID
+ check_info: 业务信息
+
+ Returns:
+ 预上传请求ID
+ """
+ req_id = str(uuid.uuid4())
+ timeout = datetime.utcnow().replace(
+ microsecond=0
+ ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS)
+
+ pre_request = ChunkUploadPreRequest(
+ id=req_id,
+ total_file_num=1,
+ uploaded_file_num=0,
+ upload_path=upload_path,
+ timeout=timeout,
+ service_id=service_id,
+ check_info=check_info,
+ )
+
+ await self.chunk_upload_repo.insert(pre_request, db_session)
+ return req_id
+
+ async def chunk_upload(
+ self,
+ upload_request: ChunkUploadRequestDto,
+ upload_path: str,
+ file_content: bytes,
+ db_session,
+ ) -> FileUploadResult:
+ """
+ 分片上传
+
+ Args:
+ upload_request: 上传请求
+ upload_path: 上传路径
+ file_content: 文件内容
+ db_session: 数据库会话
+
+ Returns:
+ 上传结果
+ """
+ upload_request.file_size = len(file_content)
+
+ pre_request = await self.chunk_upload_repo.find_by_id(
+ upload_request.req_id, db_session
+ )
+
+ if pre_request is None:
+ logger.error(f"pre-upload request not found: {upload_request.req_id}")
+ raise ValueError("Pre-upload request not found")
+
+ if pre_request.is_upload_complete():
+ logger.error(f"upload already complete: {upload_request.req_id}")
+ raise ValueError("Upload already complete")
+
+ if pre_request.is_request_timeout():
+ logger.error(f"upload request timeout: {upload_request.req_id}")
+ raise ValueError("Upload request timeout")
+
+ saved_file_path = None
+
+ if upload_request.total_chunk_num > 1:
+ saved_file_path = await self._upload_chunk(
+ upload_request, pre_request, upload_path, file_content
+ )
+ else:
+ saved_file_path = await self._upload_file(
+ upload_request, pre_request, upload_path, file_content
+ )
+
+ update_count = await self.chunk_upload_repo.update(pre_request, db_session)
+
+ if update_count == 0:
+ logger.error(f"failed to update pre-request: {upload_request.req_id}")
+ raise ValueError("Failed to update pre-upload request")
+
+ is_finish = pre_request.uploaded_file_num == pre_request.total_file_num
+
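+        # Once every expected file has been uploaded, clean up the chunk temp directory and drop the pre-upload record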
+ if is_finish:
+ temp_dir = os.path.join(
+ upload_path,
+ ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_request.id
+ )
+ try:
+ ChunksSaver.delete_folder(temp_dir)
+ except Exception as e:
+ logger.warning(f"failed to delete temp dir: {temp_dir}, error: {e}")
+
+ await self.chunk_upload_repo.delete_by_id(pre_request.id, db_session)
+
+ return FileUploadResult(
+ is_all_files_uploaded=is_finish,
+ check_info=pre_request.check_info,
+ saved_file_path=str(saved_file_path) if saved_file_path else None,
+ file_name=upload_request.file_name,
+ )
+
+ async def _upload_file(
+ self,
+ upload_request: ChunkUploadRequestDto,
+ pre_request: ChunkUploadPreRequest,
+ upload_path: str,
+ file_content: bytes
+ ) -> Path:
+ """上传单文件"""
+ saved_file = ChunksSaver.save_file(
+ upload_request, upload_path, file_content
+ )
+
+ pre_request.timeout = datetime.utcnow().replace(
+ microsecond=0
+ ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS)
+ pre_request.increment_uploaded_file_num()
+
+ return saved_file
+
+ async def _upload_chunk(
+ self,
+ upload_request: ChunkUploadRequestDto,
+ pre_request: ChunkUploadPreRequest,
+ upload_path: str,
+ file_content: bytes
+ ) -> Optional[Path]:
+ """上传分片"""
+ saved_file = ChunksSaver.save(
+ upload_request, pre_request.id, upload_path, file_content
+ )
+
+ if saved_file is not None:
+ pre_request.increment_uploaded_file_num()
+ return saved_file
+
+ pre_request.timeout = datetime.utcnow().replace(
+ microsecond=0
+ ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS)
+ return None
diff --git a/runtime/ops/examples/test_operator/metadata.yml b/runtime/ops/examples/test_operator/metadata.yml
index 2320c9ed..fb1b59b8 100644
--- a/runtime/ops/examples/test_operator/metadata.yml
+++ b/runtime/ops/examples/test_operator/metadata.yml
@@ -22,8 +22,8 @@ metrics:
runtime:
memory: 10485760
cpu: 0.05
- gpu: 0.1
- npu: 0.1
+ gpu: 0
+ npu: 0
settings:
sliderParam:
name: '滑窗测试'
diff --git a/runtime/ops/examples/test_operator/test_operator.tar b/runtime/ops/examples/test_operator/test_operator.tar
index dc986c1d..e14771ea 100644
Binary files a/runtime/ops/examples/test_operator/test_operator.tar and b/runtime/ops/examples/test_operator/test_operator.tar differ
diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml
index dd8271d1..11d6bb11 100644
--- a/runtime/ops/pyproject.toml
+++ b/runtime/ops/pyproject.toml
@@ -19,7 +19,7 @@ dependencies = [
"openslide-python>=1.4.3",
"paddleocr==3.3.0",
"paddlepaddle==3.2.2",
- "pandas>=2.2.3",
+ "pandas>=2.2.3,<3.0.0",
"presidio-analyzer==2.2.25",
"presidio-anonymizer==2.2.25",
"pycryptodome>=3.23.0",
diff --git a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py
index d1a57125..6d345f4b 100644
--- a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py
+++ b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py
@@ -14,6 +14,7 @@
from datamate.core.base_op import FileExporter, SUCCESS_STATUS
from datamate.core.constant import Fields
from datamate.wrappers.executor import RayExecutor
+from datamate.sql_manager.persistence_atction import TaskInfoPersistence
DJ_OUTPUT = "outputs"
@@ -103,6 +104,10 @@ def run(self):
logger.info('Read data...')
dataset = dataset.map(FileExporter().read_file, num_cpus=0.05)
+ # 保存原始数据文件ID集合,用于后续过滤数据检测
+ original_file_ids = set(dataset.unique("fileId"))
+
+ # 写入数据集文件
with open(self.dataset_path, "w", encoding="utf-8") as f:
for batch_df in dataset.iter_batches(batch_format="pandas", batch_size=2048):
batch_df.to_json(f, orient="records", lines=True, force_ascii=False)
@@ -118,6 +123,26 @@ def run(self):
processed_dataset = processed_dataset.map(FileExporter().save_file_and_db, num_cpus=0.05)
for _ in processed_dataset.iter_batches():
pass
+
+ # 特殊处理:识别被过滤的数据
+ if processed_dataset.count() == 0:
+ processed_file_ids = set()
+ else:
+ processed_file_ids = set(processed_dataset.unique("fileId"))
+ filtered_file_ids = original_file_ids - processed_file_ids
+
+ if filtered_file_ids:
+ logger.info(f"Found {len(filtered_file_ids)} filtered files, updating task result only")
+ for sample_dict in dataset.iter_batches(batch_format="pandas", batch_size=2048):
+ for _, row in sample_dict.iterrows():
+ if str(row.get("fileId", "")) in filtered_file_ids:
+ row["fileSize"] = "0"
+ row["fileType"] = ""
+ row["execute_status"] = SUCCESS_STATUS
+ row[Fields.instance_id] = self.cfg.instance_id
+ TaskInfoPersistence().update_task_result(row)
+
+ self.scan_files()
except Exception as e:
logger.error(f"An unexpected error occurred.", e)
raise e
diff --git a/scripts/db/data-cleaning-init.sql b/scripts/db/data-cleaning-init.sql
index 93322f44..2e0501c9 100644
--- a/scripts/db/data-cleaning-init.sql
+++ b/scripts/db/data-cleaning-init.sql
@@ -7,9 +7,10 @@ CREATE TABLE IF NOT EXISTS t_clean_template
id VARCHAR(64) PRIMARY KEY,
name VARCHAR(64) UNIQUE,
description VARCHAR(256),
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- created_by VARCHAR(256)
+ created_by VARCHAR(256),
+ updated_by VARCHAR(256),
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
COMMENT ON TABLE t_clean_template IS '清洗模板表';
@@ -19,6 +20,7 @@ COMMENT ON COLUMN t_clean_template.description IS '模板描述';
COMMENT ON COLUMN t_clean_template.created_at IS '创建时间';
COMMENT ON COLUMN t_clean_template.updated_at IS '更新时间';
COMMENT ON COLUMN t_clean_template.created_by IS '创建者';
+COMMENT ON COLUMN t_clean_template.updated_by IS '更新者';
-- 清洗任务表
CREATE TABLE IF NOT EXISTS t_clean_task
@@ -180,4 +182,4 @@ VALUES
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, NULL),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, NULL),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, NULL)
- ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
\ No newline at end of file
+ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
index 0587b841..e6650e4c 100644
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -49,6 +49,10 @@ CREATE TABLE IF NOT EXISTS t_operator_release
version VARCHAR(255),
release_date TIMESTAMP,
changelog JSON,
+ created_by VARCHAR(255),
+ updated_by VARCHAR(255),
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (id, version)
);
@@ -60,7 +64,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category
value VARCHAR(64) UNIQUE,
type VARCHAR(64),
parent_id VARCHAR(64),
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ created_by VARCHAR(255),
+ updated_by VARCHAR(255),
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
COMMENT ON TABLE t_operator_category IS '算子分类表';
@@ -76,6 +83,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category_relation
(
category_id VARCHAR(64),
operator_id VARCHAR(64),
+ created_by VARCHAR(255),
+ updated_by VARCHAR(255),
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (category_id, operator_id)
);
@@ -207,9 +218,6 @@ VALUES
('ObjectDetectionRectangle', '图像目标检测与预标注', '基于 YOLOv8 的图像目标检测算子。对输入图像进行目标检测,输出带矩形框与类别标签的标注图像,并生成结构化标注 JSON(包含类别、置信度与边界框坐标)。支持将检测结果导出为 Label Studio 兼容的 predictions 预标注格式(rectanglelabels),可在标注任务中直接加载并进行人工校正,从而显著降低人工标注成本并提升标注效率。', '1.0.0', 'image', 'image,json', null, null, '', 12288, false, 'system', 'system')
ON CONFLICT DO NOTHING;
-INSERT INTO t_operator_release(id, version, release_date, changelog)
-VALUES ('MineruFormatter', '1.0.0', '2026-03-30', '["aaa","bbb"]');
-
INSERT INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
diff --git a/scripts/images/backend-python/Dockerfile b/scripts/images/backend-python/Dockerfile
index 4d276dd0..cf24083c 100644
--- a/scripts/images/backend-python/Dockerfile
+++ b/scripts/images/backend-python/Dockerfile
@@ -55,13 +55,15 @@ ENV NLTK_DATA=/usr/local/nltk_data
# Copy the rest of the application
COPY runtime/datamate-python /app
+COPY runtime/ops/examples/test_operator/test_operator.tar /app/test_operator.tar
COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/
COPY runtime/datamate-python/deploy/docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh \
- && dos2unix /docker-entrypoint.sh || true
+ && dos2unix /docker-entrypoint.sh || true \
+ && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
# Expose the application port
EXPOSE 18000