From 08952f8d1c815de8eb1965a2dbb2093122b4d770 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Sat, 31 Jan 2026 17:09:11 +0800 Subject: [PATCH 01/20] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../infrastructure/persistence/Impl/OperatorRepositoryImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java index 7b43869b..a36b2649 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java @@ -48,7 +48,7 @@ public int countOperatorByStar(boolean isStar) { @Override public boolean operatorInTemplateOrRunning(String operatorId) { - return mapper.operatorInTemplate(operatorId) > 0 && mapper.operatorInUnstopTask(operatorId) > 0; + return mapper.operatorInTemplate(operatorId) > 0 || mapper.operatorInUnstopTask(operatorId) > 0; } @Override From e0da089a61ec6246e615d0830606b33726e8c1fa Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Sat, 31 Jan 2026 18:04:29 +0800 Subject: [PATCH 02/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gateway/ApiGatewayApplication.java | 4 + .../datamate-python/app/db/models/__init__.py | 16 + .../app/db/models/chunk_upload.py | 38 ++ .../datamate-python/app/db/models/operator.py | 70 ++ .../datamate-python/app/module/__init__.py | 4 + .../app/module/operator/README.md | 138 ++++ .../app/module/operator/__init__.py | 4 + .../app/module/operator/constants.py | 50 ++ .../app/module/operator/exceptions.py | 72 +++ .../app/module/operator/interface/__init__.py | 9 + .../operator/interface/category_routes.py | 43 ++ .../operator/interface/operator_routes.py | 270 ++++++++ .../app/module/operator/parsers/__init__.py | 15 + .../operator/parsers/abstract_parser.py | 97 +++ .../module/operator/parsers/parser_holder.py | 52 ++ .../app/module/operator/parsers/tar_parser.py | 41 ++ .../app/module/operator/parsers/zip_parser.py | 41 ++ .../module/operator/repository/__init__.py | 15 + .../category_relation_repository.py | 77 +++ .../repository/category_repository.py | 23 + .../repository/operator_release_repository.py | 72 +++ .../repository/operator_repository.py | 121 ++++ .../app/module/operator/schema/__init__.py | 29 + .../app/module/operator/schema/category.py | 40 ++ .../app/module/operator/schema/operator.py | 72 +++ .../app/module/operator/schema/release.py | 22 + .../app/module/operator/service/__init__.py | 11 + .../operator/service/category_service.py | 101 +++ .../operator/service/operator_service.py | 599 ++++++++++++++++++ .../app/module/shared/__init__.py | 21 + .../module/shared/chunk_upload_repository.py | 95 +++ .../app/module/shared/chunks_saver.py | 146 +++++ .../app/module/shared/file_models.py | 38 ++ .../app/module/shared/file_service.py | 187 ++++++ scripts/db/data-operator-init.sql | 13 +- 
scripts/images/backend-python/Dockerfile | 1 + 36 files changed, 2646 insertions(+), 1 deletion(-) create mode 100644 runtime/datamate-python/app/db/models/chunk_upload.py create mode 100644 runtime/datamate-python/app/db/models/operator.py create mode 100644 runtime/datamate-python/app/module/operator/README.md create mode 100644 runtime/datamate-python/app/module/operator/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/constants.py create mode 100644 runtime/datamate-python/app/module/operator/exceptions.py create mode 100644 runtime/datamate-python/app/module/operator/interface/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/interface/category_routes.py create mode 100644 runtime/datamate-python/app/module/operator/interface/operator_routes.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/abstract_parser.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/parser_holder.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/tar_parser.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/zip_parser.py create mode 100644 runtime/datamate-python/app/module/operator/repository/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/repository/category_relation_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/category_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/operator_release_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/operator_repository.py create mode 100644 runtime/datamate-python/app/module/operator/schema/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/schema/category.py create mode 100644 runtime/datamate-python/app/module/operator/schema/operator.py create mode 100644 runtime/datamate-python/app/module/operator/schema/release.py create mode 100644 runtime/datamate-python/app/module/operator/service/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/service/category_service.py create mode 100644 runtime/datamate-python/app/module/operator/service/operator_service.py create mode 100644 runtime/datamate-python/app/module/shared/chunk_upload_repository.py create mode 100644 runtime/datamate-python/app/module/shared/chunks_saver.py create mode 100644 runtime/datamate-python/app/module/shared/file_models.py create mode 100644 runtime/datamate-python/app/module/shared/file_service.py diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java index ee504973..de9e1f28 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java @@ -45,6 +45,10 @@ public RouteLocator customRouteLocator(RouteLocatorBuilder builder) { .route("python-service", r -> r.path("/api/rag/**", "api/models/**") .uri("http://datamate-backend-python:18000")) + // 算子市场服务路由 + .route("data-operator", r -> r.path("/api/operators/**", "/api/categories/**") + .uri("http://datamate-backend-python:18000")) + .route("deer-flow-frontend", r -> r.path("/chat/**") .uri("http://deer-flow-frontend:3000")) diff --git a/runtime/datamate-python/app/db/models/__init__.py
b/runtime/datamate-python/app/db/models/__init__.py index 2b83de26..060e4b64 100644 --- a/runtime/datamate-python/app/db/models/__init__.py +++ b/runtime/datamate-python/app/db/models/__init__.py @@ -21,6 +21,17 @@ EvaluationItem ) +from .operator import ( + Operator, + Category, + CategoryRelation, + OperatorRelease +) + +from .chunk_upload import ( + ChunkUploadPreRequest +) + __all__ = [ "Dataset", "DatasetTag", @@ -32,4 +43,9 @@ "LabelingProject", "EvaluationTask", "EvaluationItem", + "Operator", + "Category", + "CategoryRelation", + "OperatorRelease", + "ChunkUploadPreRequest", ] diff --git a/runtime/datamate-python/app/db/models/chunk_upload.py b/runtime/datamate-python/app/db/models/chunk_upload.py new file mode 100644 index 00000000..5b5a2b0c --- /dev/null +++ b/runtime/datamate-python/app/db/models/chunk_upload.py @@ -0,0 +1,38 @@ +""" +Chunk Upload Database Model +分片上传数据库模型 +""" +from sqlalchemy import Column, String, Integer, DateTime +from sqlalchemy.sql import func + +from app.db.models.base_entity import Base, BaseEntity + + +class ChunkUploadPreRequest(BaseEntity): + """分片上传预请求""" + __tablename__ = "t_chunk_upload_request" + + id = Column(String(36), primary_key=True, comment="请求ID") + total_file_num = Column(Integer, nullable=False, comment="总文件数") + uploaded_file_num = Column(Integer, nullable=True, comment="已上传文件数") + upload_path = Column(String(512), nullable=False, comment="文件路径") + timeout = Column(DateTime, nullable=False, comment="上传请求超时时间") + service_id = Column(String(64), nullable=True, comment="上传请求所属服务ID") + check_info = Column(String(512), nullable=True, comment="业务信息") + + def increment_uploaded_file_num(self): + """增加已上传文件数""" + if self.uploaded_file_num is None: + self.uploaded_file_num = 1 + else: + self.uploaded_file_num += 1 + + def is_upload_complete(self) -> bool: + """检查是否已完成上传""" + return (self.uploaded_file_num is not None and + self.uploaded_file_num == self.total_file_num) + + def is_request_timeout(self) -> bool: + """检查是否已超时""" + from datetime import datetime, timezone + return self.timeout is not None and datetime.now(timezone.utc) > self.timeout diff --git a/runtime/datamate-python/app/db/models/operator.py b/runtime/datamate-python/app/db/models/operator.py new file mode 100644 index 00000000..57362461 --- /dev/null +++ b/runtime/datamate-python/app/db/models/operator.py @@ -0,0 +1,70 @@ +""" +Operator Market Data Models +算子市场数据模型 +""" +from sqlalchemy import Column, String, Integer, Boolean, BigInteger, Text, JSON, TIMESTAMP, Index +from sqlalchemy.sql import func + +from app.db.models.base_entity import Base, BaseEntity + + +class Operator(BaseEntity): + """算子实体""" + __tablename__ = "t_operator" + + id = Column(String(36), primary_key=True, index=True, comment="算子ID") + name = Column(String(255), nullable=False, comment="算子名称") + description = Column(Text, nullable=True, comment="算子描述") + version = Column(String(50), nullable=False, comment="算子版本") + inputs = Column(Text, nullable=True, comment="输入定义(JSON)") + outputs = Column(Text, nullable=True, comment="输出定义(JSON)") + runtime = Column(Text, nullable=True, comment="运行时配置(JSON)") + settings = Column(Text, nullable=True, comment="算子设置(JSON)") + file_name = Column(String(255), nullable=True, comment="文件名") + file_size = Column(BigInteger, nullable=True, comment="文件大小(字节)") + metrics = Column(Text, nullable=True, comment="算子指标(JSON)") + usage_count = Column(Integer, default=0, nullable=False, comment="使用次数") + is_star = Column(Boolean, default=False, nullable=False, comment="是否收藏") + + 
__table_args__ = ( + Index("idx_is_star", "is_star"), + ) + + +class Category(BaseEntity): + """算子分类实体""" + __tablename__ = "t_operator_category" + + id = Column(String(36), primary_key=True, index=True, comment="分类ID") + name = Column(String(255), nullable=False, comment="分类名称") + value = Column(String(255), nullable=True, comment="分类值") + type = Column(String(50), nullable=True, comment="分类类型") + parent_id = Column(String(36), nullable=False, default="0", comment="父分类ID") + + +class CategoryRelation(BaseEntity): + """算子分类关系实体""" + __tablename__ = "t_operator_category_relation" + + category_id = Column(String(36), primary_key=True, comment="分类ID") + operator_id = Column(String(36), primary_key=True, comment="算子ID") + + __table_args__ = ( + Index("idx_category_id", "category_id"), + Index("idx_operator_id", "operator_id"), + ) + + +class OperatorRelease(BaseEntity): + """算子发布版本实体""" + __tablename__ = "t_operator_release" + + id = Column(String(36), primary_key=True, comment="算子ID") + version = Column(String(50), primary_key=True, comment="版本号") + release_date = Column(TIMESTAMP, nullable=False, default=func.now(), comment="发布时间") + changelog = Column(JSON, nullable=True, comment="更新日志列表") + + +# Ignore data scope for operator models +for model in [Operator, Category, CategoryRelation, OperatorRelease]: + model.__ignore_data_scope__ = True diff --git a/runtime/datamate-python/app/module/__init__.py b/runtime/datamate-python/app/module/__init__.py index 7d3c482b..9437b11d 100644 --- a/runtime/datamate-python/app/module/__init__.py +++ b/runtime/datamate-python/app/module/__init__.py @@ -7,6 +7,8 @@ from .evaluation.interface import router as evaluation_router from .collection.interface import router as collection_route from .rag.interface.rag_interface import router as rag_router +from .operator.interface import operator_router +from .operator.interface import category_router router = APIRouter( prefix="/api" ) @@ -19,5 +21,7 @@ router.include_router(evaluation_router) router.include_router(collection_route) router.include_router(rag_router) +router.include_router(operator_router) +router.include_router(category_router) __all__ = ["router"] diff --git a/runtime/datamate-python/app/module/operator/README.md b/runtime/datamate-python/app/module/operator/README.md new file mode 100644 index 00000000..703e8ed3 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/README.md @@ -0,0 +1,138 @@ +# Operator Market Service - Python Implementation + +## 概述 + +这是 `operator-market-service` 的 Python 实现,已集成到 `runtime/datamate-python` 项目中。 + +## 功能 + +- **算子管理**:创建、查询、更新、删除算子 +- **分类管理**:树状分类结构查询 +- **文件上传**:支持算子文件上传和解析(支持 tar/zip 格式) +- **MCP 工具集成**:通过 fastapi-mcp 提供 MCP 工具接口 + +## 目录结构 + +``` +app/module/operator/ +├── __init__.py # 模块入口 +├── constants.py # 常量定义 +├── exceptions.py # 异常定义 +├── schema/ # Pydantic Schema 定义 +│ ├── __init__.py +│ ├── operator.py # 算子相关 Schema +│ ├── category.py # 分类相关 Schema +│ └── release.py # 发布版本 Schema +├── parsers/ # 文件解析器 +│ ├── __init__.py +│ ├── abstract_parser.py # 抽象解析器基类 +│ ├── tar_parser.py # TAR 文件解析器 +│ ├── zip_parser.py # ZIP 文件解析器 +│ └── parser_holder.py # 解析器持有者 +├── repository/ # 数据访问层 +│ ├── __init__.py +│ ├── operator_repository.py +│ ├── category_repository.py +│ ├── category_relation_repository.py +│ └── operator_release_repository.py +├── service/ # 服务层 +│ ├── __init__.py +│ ├── operator_service.py +│ └── category_service.py +└── interface/ # API 接口层 + ├── __init__.py + ├── operator_routes.py + └── category_routes.py +``` + +## API 端点 + 
+### 算子相关 (`/api/operators`) + +| 方法 | 路径 | 描述 | +|------|--------|------| +| POST | `/list` | 查询算子列表(支持分页、分类过滤、关键词搜索) | +| GET | `/{operator_id}` | 获取算子详情 | +| PUT | `/{operator_id}` | 更新算子信息 | +| POST | `/create` | 创建新算子 | +| POST | `/upload` | 上传算子文件 | +| POST | `/upload/pre-upload` | 预上传(获取请求 ID) | +| POST | `/upload/chunk` | 分块上传 | +| DELETE | `/{operator_id}` | 删除算子 | +| GET | `/examples/download` | 下载示例算子 | + +### 分类相关 (`/api/categories`) + +| 方法 | 路径 | 描述 | +|------|--------|------| +| GET | `/tree` | 获取分类树状结构 | + +## 数据库表 + +- `t_operator` - 算子表 +- `t_operator_category` - 分类表 +- `t_operator_category_relation` - 分类关系表 +- `t_operator_release` - 算子发布版本表 +- `v_operator` - 算子视图(包含分类信息) + +## 文件格式支持 + +算子文件需包含 `metadata.yml` 文件,格式如下: + +```yaml +raw_id: "operator-id" +name: "算子名称" +description: "算子描述" +version: "1.0.0" +language: "python" # python, java +modal: "text" # text, image, audio, video +vendor: "datamate" # datamate, data-juicer, or other +inputs: {...} +outputs: {...} +runtime: {...} +settings: {...} +metrics: {...} +release: + - "更新日志1" + - "更新日志2" +``` + +## 待实现功能 + +- [ ] 算子收藏功能完善 +- [ ] 标签过滤功能 + +## 使用示例 + +### 查询算子列表 + +```bash +curl -X POST "http://localhost:18000/api/operators/list" \ + -H "Content-Type: application/json" \ + -d '{ + "page": 1, + "size": 10, + "keyword": "test", + "isStar": false + }' +``` + +### 获取分类树 + +```bash +curl -X GET "http://localhost:18000/api/categories/tree" +``` + +### 创建算子 + +```bash +curl -X POST "http://localhost:18000/api/operators/create" \ + -H "Content-Type: application/json" \ + -d '{ + "id": "new-operator-id", + "name": "新算子", + "description": "这是一个新算子", + "version": "1.0.0", + "fileName": "operator.tar" + }' +``` diff --git a/runtime/datamate-python/app/module/operator/__init__.py b/runtime/datamate-python/app/module/operator/__init__.py new file mode 100644 index 00000000..1ac84e31 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/__init__.py @@ -0,0 +1,4 @@ +""" +Operator Market Service Module +算子市场服务模块 +""" diff --git a/runtime/datamate-python/app/module/operator/constants.py b/runtime/datamate-python/app/module/operator/constants.py new file mode 100644 index 00000000..e6d83ee9 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/constants.py @@ -0,0 +1,50 @@ +""" +Operator Market Constants +算子市场常量定义 +""" + +# Service ID +SERVICE_ID = "operator" + +# YAML metadata path +YAML_PATH = "metadata.yml" + +# Example operator file path +EXAMPLE_OPERATOR_PATH = "/app/test_operator.tar" + +# Category IDs +CATEGORY_PYTHON = "python" +CATEGORY_PYTHON_ID = "9eda9d5d-072b-499b-916c-797a0a8750e1" + +CATEGORY_JAVA = "java" +CATEGORY_JAVA_ID = "b5bfc548-8ef6-417c-b8a6-a4197c078249" + +CATEGORY_CUSTOMIZED_ID = "ec2cdd17-8b93-4a81-88c4-ac9e98d10757" +CATEGORY_TEXT_ID = "d8a5df7a-52a9-42c2-83c4-01062e60f597" +CATEGORY_IMAGE_ID = "de36b61c-9e8a-4422-8c31-d30585c7100f" +CATEGORY_AUDIO_ID = "42dd9392-73e4-458c-81ff-41751ada47b5" +CATEGORY_VIDEO_ID = "a233d584-73c8-4188-ad5d-8f7c8dda9c27" +CATEGORY_ALL_ID = "4d7dbd77-0a92-44f3-9056-2cd62d4a71e4" +CATEGORY_STAR_ID = "51847c24-bba9-11f0-888b-5b143cb738aa" +CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3" +CATEGORY_DATAMATE_ID = "431e7798-5426-4e1a-aae6-b9905a836b34" +CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996" +CATEGORY_OTHER_VENDOR_ID = "f00eaa3e-96c1-4de4-96cd-9848ef5429ec" + +# Category mapping +CATEGORY_MAP = { + CATEGORY_PYTHON: CATEGORY_PYTHON_ID, + 
CATEGORY_JAVA: CATEGORY_JAVA_ID, + "text": CATEGORY_TEXT_ID, + "image": CATEGORY_IMAGE_ID, + "audio": CATEGORY_AUDIO_ID, + "video": CATEGORY_VIDEO_ID, + "all": CATEGORY_ALL_ID, + "datamate": CATEGORY_DATAMATE_ID, + "data-juicer": CATEGORY_DATA_JUICER_ID, +} + +# File paths +OPERATOR_BASE_PATH = "/operators" +UPLOAD_DIR = "upload" +EXTRACT_DIR = "extract" diff --git a/runtime/datamate-python/app/module/operator/exceptions.py b/runtime/datamate-python/app/module/operator/exceptions.py new file mode 100644 index 00000000..6eca13f5 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/exceptions.py @@ -0,0 +1,72 @@ +""" +Operator Market Exceptions +算子市场异常定义 +""" +from enum import Enum +from typing import Optional + + +class OperatorErrorCode: + """算子错误码""" + def __init__(self, message: str, error_code: str): + self.message = message + self.error_code = error_code + + +class OperatorException(RuntimeError): + """算子异常基类""" + def __init__(self, operator_error_code: OperatorErrorCode): + self.message = operator_error_code.message + self.error_code = operator_error_code.error_code + super().__init__(self.message) + + +class OperatorErrorCodeEnum(Enum): + """算子错误码枚举""" + FIELD_NOT_FOUND = OperatorErrorCode( + "必填字段缺失", "OPERATOR_FIELD_NOT_FOUND" + ) + SETTINGS_PARSE_FAILED = OperatorErrorCode( + "设置解析失败", "OPERATOR_SETTINGS_PARSE_FAILED" + ) + OPERATOR_IN_INSTANCE = OperatorErrorCode( + "算子正在使用中", "OPERATOR_IN_INSTANCE" + ) + CANT_DELETE_PREDEFINED_OPERATOR = OperatorErrorCode( + "无法删除预定义算子", "CANT_DELETE_PREDEFINED_OPERATOR" + ) + + +class FieldNotFoundError(OperatorException): + """必填字段缺失""" + def __init__(self, field_name: str): + super().__init__( + OperatorErrorCodeEnum.FIELD_NOT_FOUND.value + ) + self.message = f"Required field '{field_name}' is missing" + self.field_name = field_name + + +class SettingsParseError(OperatorException): + """设置解析失败""" + def __init__(self, detail: Optional[str] = None): + super().__init__( + OperatorErrorCodeEnum.SETTINGS_PARSE_FAILED.value + ) + self.detail = detail + + +class OperatorInInstanceError(OperatorException): + """算子正在使用中""" + def __init__(self): + super().__init__( + OperatorErrorCodeEnum.OPERATOR_IN_INSTANCE.value + ) + + +class CannotDeletePredefinedOperatorError(OperatorException): + """无法删除预定义算子""" + def __init__(self): + super().__init__( + OperatorErrorCodeEnum.CANT_DELETE_PREDEFINED_OPERATOR.value + ) diff --git a/runtime/datamate-python/app/module/operator/interface/__init__.py b/runtime/datamate-python/app/module/operator/interface/__init__.py new file mode 100644 index 00000000..f83ad24f --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/__init__.py @@ -0,0 +1,9 @@ +""" +Operator Market API Interfaces +算子市场 API 接口层 +""" +from .operator_routes import router as operator_router +from .category_routes import router as category_router + + +__all__ = ["operator_router", "category_router"] diff --git a/runtime/datamate-python/app/module/operator/interface/category_routes.py b/runtime/datamate-python/app/module/operator/interface/category_routes.py new file mode 100644 index 00000000..ed4207e0 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/category_routes.py @@ -0,0 +1,43 @@ +""" +Category API Routes +分类 API 路由 +""" +from fastapi import APIRouter, Depends + +from app.db.session import get_db +from app.module.shared.schema import StandardResponse +from app.module.operator.schema import CategoryTreePagedResponse +from app.module.operator.service import CategoryService +from 
app.module.operator.repository import ( + CategoryRepository, + CategoryRelationRepository, +) +from app.module.operator.repository.operator_repository import OperatorRepository +from app.db.models.operator import Category, CategoryRelation, Operator + +router = APIRouter(prefix="/categories", tags=["Category"]) + + +def get_category_service() -> CategoryService: + """获取分类服务实例""" + return CategoryService( + category_repo=CategoryRepository(Category()), + category_relation_repo=CategoryRelationRepository(CategoryRelation()), + operator_repo=OperatorRepository(Operator()), + ) + + +@router.get( + "/tree", + response_model=StandardResponse[CategoryTreePagedResponse], + summary="获取分类树", + description="获取算子树状分类结构,包含分组维度(如语言、模态)及资源统计数量", + tags=['mcp'] +) +async def get_category_tree( + service: CategoryService = Depends(get_category_service), + db=Depends(get_db) +): + """获取分类树""" + result = await service.get_all_categories(db) + return StandardResponse(code=200, message="success", data=result) diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py new file mode 100644 index 00000000..8a1911d2 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -0,0 +1,270 @@ +""" +Operator API Routes +算子 API 路由 +""" +from pathlib import Path +from typing import List, Optional + +from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form +from fastapi.responses import FileResponse + +from app.db.session import get_db +from app.module.shared.schema import StandardResponse, PaginatedData +from app.module.operator.schema import ( + OperatorDto, + OperatorUpdateDto, + OperatorListRequest, + PreUploadResponse, +) +from app.module.operator.service import OperatorService +from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, +) +from app.module.operator.parsers import ParserHolder +from app.db.models.operator import Operator, CategoryRelation, OperatorRelease +from app.core.logging import get_logger +from app.module.shared.file_service import FileService +from app.module.shared.chunk_upload_repository import ChunkUploadRepository +from app.db.models.chunk_upload import ChunkUploadPreRequest + +logger = get_logger(__name__) + +router = APIRouter(prefix="/operators", tags=["Operator"]) + +def get_operator_service() -> OperatorService: + """获取算子服务实例""" + return OperatorService( + operator_repo=OperatorRepository(Operator()), + category_relation_repo=CategoryRelationRepository(CategoryRelation()), + operator_release_repo=OperatorReleaseRepository(OperatorRelease()), + parser_holder=ParserHolder(), + file_service=FileService(ChunkUploadRepository()), + ) + + +@router.post( + "/list", + response_model=StandardResponse[PaginatedData[OperatorDto]], + summary="查询算子列表", + description="根据参数查询算子列表(支持分页、分类过滤、关键词搜索)", + tags=['mcp'] +) +async def list_operators( + request: OperatorListRequest, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """查询算子列表""" + operators = await service.get_operators( + page=request.page, + size=request.size, + categories=request.categories, + keyword=request.keyword, + is_star=request.is_star, + db=db + ) + + count = await service.count_operators( + categories=request.categories, + keyword=request.keyword, + is_star=request.is_star, + db=db + ) + + total_pages = (count + request.size - 1) // request.size # Ceiling division + + return 
StandardResponse( + code=200, + message="success", + data=PaginatedData( + page=request.page, + size=request.size, + total_elements=count, + total_pages=total_pages, + content=operators, + ) + ) + + +@router.get( + "/{operator_id}", + response_model=StandardResponse[OperatorDto], + summary="获取算子详情", + description="根据 ID 获取算子详细信息" +) +async def get_operator( + operator_id: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """获取算子详情""" + try: + operator = await service.get_operator_by_id(operator_id, db) + return StandardResponse(code=200, message="success", data=operator) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) + + +@router.put( + "/{operator_id}", + response_model=StandardResponse[OperatorDto], + summary="更新算子", + description="更新算子信息" +) +async def update_operator( + operator_id: str, + request: OperatorUpdateDto, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """更新算子""" + try: + operator = await service.update_operator(operator_id, request, db) + await db.commit() + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + logger.error(f"{operator_id} {request}", e) + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/create", + response_model=StandardResponse[OperatorDto], + summary="创建算子", + description="创建新算子" +) +async def create_operator( + request: OperatorDto, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """创建算子""" + try: + operator = await service.create_operator(request, db) + await db.commit() + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/upload", + response_model=StandardResponse[OperatorDto], + summary="上传算子", + description="上传算子文件并解析元数据" +) +async def upload_operator( + file_name: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """上传算子""" + try: + operator = await service.upload_operator(file_name, db) + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/upload/pre-upload", + response_model=StandardResponse[PreUploadResponse], + summary="预上传", + description="获取预上传 ID,用于分块上传" +) +async def pre_upload( + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """预上传""" + result = await service.pre_upload(db) + return StandardResponse( + code=200, + message="success", + data=PreUploadResponse(req_id=result["req_id"]) + ) + + +@router.post( + "/upload/chunk", + response_model=StandardResponse[dict], + summary="分块上传", + description="分块上传算子文件" +) +async def chunk_upload( + req_id: str = Form(..., description="预上传ID"), + file_no: int = Form(1, description="文件编号"), + file_name: str = Form(..., description="文件名"), + total_chunk_num: int = Form(1, description="总分块数"), + chunk_no: int = Form(1, description="当前分块号"), + file: UploadFile = ..., + check_sum_hex: Optional[str] = Form(None, description="校验和"), + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """分块上传""" + try: + file_content = await file.read() + result = await service.chunk_upload( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + 
check_sum_hex=check_sum_hex, + file_content=file_content, + db=db + ) + await db.commit() + return StandardResponse(code=200, message="success", data=result.dict()) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.delete( + "/{operator_id}", + response_model=StandardResponse[None], + summary="删除算子", + description="删除算子" +) +async def delete_operator( + operator_id: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """删除算子""" + try: + await service.delete_operator(operator_id, db) + await db.commit() + return StandardResponse(code=200, message="success", data=None) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/examples/download", + response_class=FileResponse, + summary="下载示例算子", + description="下载示例算子文件" +) +async def download_example_operator( + service: OperatorService = Depends(get_operator_service) +): + """下载示例算子""" + from app.module.operator.constants import EXAMPLE_OPERATOR_PATH + example_path = EXAMPLE_OPERATOR_PATH + try: + file_path = service.download_example_operator(example_path) + return FileResponse( + path=str(file_path), + filename=file_path.name, + media_type="application/octet-stream" + ) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Example file not found") diff --git a/runtime/datamate-python/app/module/operator/parsers/__init__.py b/runtime/datamate-python/app/module/operator/parsers/__init__.py new file mode 100644 index 00000000..db3c0504 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/__init__.py @@ -0,0 +1,15 @@ +""" +Operator File Parsers +算子文件解析器 +""" +from .abstract_parser import AbstractParser +from .tar_parser import TarParser +from .zip_parser import ZipParser +from .parser_holder import ParserHolder + +__all__ = [ + "AbstractParser", + "TarParser", + "ZipParser", + "ParserHolder", +] diff --git a/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py new file mode 100644 index 00000000..27e9aa3c --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py @@ -0,0 +1,97 @@ +""" +Abstract Parser +抽象解析器基类 +""" +import json +import yaml +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional + +from app.module.operator.schema import OperatorDto, OperatorReleaseDto +from app.module.operator.constants import CATEGORY_MAP, CATEGORY_OTHER_VENDOR_ID, CATEGORY_CUSTOMIZED_ID +from app.module.operator.exceptions import FieldNotFoundError + + +class AbstractParser(ABC): + """算子文件解析器抽象基类""" + + @abstractmethod + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """ + 从压缩包内读取指定路径的 yaml 文件并解析为 OperatorDto + + Args: + archive_path: 压缩包路径(zip 或 tar) + entry_path: 压缩包内部的文件路径,例如 "config/app.yaml" + + Returns: + 解析后的 OperatorDto + """ + pass + + @abstractmethod + def extract_to(self, archive_path: str, target_dir: str) -> None: + """ + 将压缩包解压到目标目录(保持相对路径) + + Args: + archive_path: 压缩包路径 + target_dir: 目标目录 + """ + pass + + def parse_yaml(self, yaml_content: str) -> OperatorDto: + """解析 YAML 内容为 OperatorDto""" + content: Dict[str, Any] = yaml.safe_load(yaml_content) + + operator = OperatorDto( + id=self._to_string(content.get("raw_id")), + name=self._to_string(content.get("name")), + description=self._to_string(content.get("description")), + 
version=self._to_string(content.get("version")), + inputs=self._to_json(content.get("inputs")), + outputs=self._to_json(content.get("outputs")), + runtime=self._to_json(content.get("runtime")), + settings=self._to_json(content.get("settings")), + metrics=self._to_json(content.get("metrics")), + ) + + # Handle changelog + changelog = content.get("release") + if isinstance(changelog, list): + operator_release = OperatorReleaseDto(changelog=changelog) + else: + operator_release = OperatorReleaseDto(changelog=[]) + operator.releases = [operator_release] + + # Build categories + categories = [ + CATEGORY_MAP.get(self._to_lower(content.get("language")), ""), + CATEGORY_MAP.get(self._to_lower(content.get("modal")), ""), + CATEGORY_MAP.get(self._to_lower(content.get("vendor")), CATEGORY_OTHER_VENDOR_ID), + CATEGORY_CUSTOMIZED_ID, + ] + operator.categories = categories + + return operator + + def _to_string(self, obj: Any) -> str: + """转换为字符串""" + if obj is None: + raise FieldNotFoundError("field") + return str(obj) + + def _to_lower(self, obj: Any) -> str: + """转换为小写字符串""" + if obj is None: + raise FieldNotFoundError("field") + return str(obj).lower() + + def _to_json(self, obj: Any) -> Optional[str]: + """转换为 JSON 字符串""" + if obj is None: + return None + try: + return json.dumps(obj) + except (TypeError, ValueError) as e: + raise ValueError(f"Failed to serialize to JSON: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/parser_holder.py b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py new file mode 100644 index 00000000..e4a79d63 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py @@ -0,0 +1,52 @@ +""" +Parser Holder +解析器持有者,根据文件类型选择合适的解析器 +""" +import os +from typing import Dict, Type + +from app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.parsers.tar_parser import TarParser +from app.module.operator.parsers.zip_parser import ZipParser +from app.module.operator.schema import OperatorDto + + +class ParserHolder: + """解析器持有者,根据文件类型选择解析器""" + + def __init__(self): + self._parsers: Dict[str, AbstractParser] = { + "tar": TarParser(), + "gz": TarParser(), + "tgz": TarParser(), + "zip": ZipParser(), + } + + def get_parser(self, file_path: str) -> AbstractParser: + """根据文件扩展名获取解析器""" + _, ext = os.path.splitext(file_path) + file_type = ext.lstrip('.').lower() + + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + return self._parsers[file_type] + + def parse_yaml_from_archive( + self, + file_type: str, + archive_path: str, + entry_path: str + ) -> OperatorDto: + """从压缩包解析 YAML""" + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + return self._parsers[file_type].parse_yaml_from_archive(archive_path, entry_path) + + def extract_to(self, file_type: str, archive_path: str, target_dir: str) -> None: + """解压文件到目标目录""" + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + self._parsers[file_type].extract_to(archive_path, target_dir) diff --git a/runtime/datamate-python/app/module/operator/parsers/tar_parser.py b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py new file mode 100644 index 00000000..e2618cfa --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py @@ -0,0 +1,41 @@ +""" +Tar File Parser +TAR 文件解析器 +""" +import tarfile +import os +from typing import Optional + +from 
app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.schema import OperatorDto + + +class TarParser(AbstractParser): + """TAR 压缩包解析器""" + + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """从 TAR 文件中解析 YAML""" + try: + with tarfile.open(archive_path, 'r:*') as tar: + for member in tar.getmembers(): + if member.name == entry_path or member.name.endswith(f"/{entry_path}"): + file = tar.extractfile(member) + if file: + content = file.read().decode('utf-8') + return self.parse_yaml(content) + raise FileNotFoundError(f"File '{entry_path}' not found in archive") + except (tarfile.TarError, EOFError) as e: + raise ValueError(f"Failed to parse TAR file: {e}") + + def extract_to(self, archive_path: str, target_dir: str) -> None: + """解压 TAR 文件到目标目录""" + try: + os.makedirs(target_dir, exist_ok=True) + with tarfile.open(archive_path, 'r:*') as tar: + # Safety check: prevent path traversal + for member in tar.getmembers(): + if os.path.isabs(member.name) or ".." in member.name.split("/"): + raise ValueError(f"Unsafe path in archive: {member.name}") + tar.extractall(target_dir) + except (tarfile.TarError, EOFError) as e: + raise ValueError(f"Failed to extract TAR file: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/zip_parser.py b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py new file mode 100644 index 00000000..a1741efe --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py @@ -0,0 +1,41 @@ +""" +Zip File Parser +ZIP 文件解析器 +""" +import zipfile +import os +from typing import Optional + +from app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.schema import OperatorDto + + +class ZipParser(AbstractParser): + """ZIP 压缩包解析器""" + + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """从 ZIP 文件中解析 YAML""" + try: + with zipfile.ZipFile(archive_path, 'r') as zf: + # Check all possible paths + for name in zf.namelist(): + if name == entry_path or name.endswith(f"/{entry_path}"): + with zf.open(name) as file: + content = file.read().decode('utf-8') + return self.parse_yaml(content) + raise FileNotFoundError(f"File '{entry_path}' not found in archive") + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + raise ValueError(f"Failed to parse ZIP file: {e}") + + def extract_to(self, archive_path: str, target_dir: str) -> None: + """解压 ZIP 文件到目标目录""" + try: + os.makedirs(target_dir, exist_ok=True) + with zipfile.ZipFile(archive_path, 'r') as zf: + # Safety check: prevent path traversal + for name in zf.namelist(): + if os.path.isabs(name) or ".." 
in name.split("/"): + raise ValueError(f"Unsafe path in archive: {name}") + zf.extractall(target_dir) + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + raise ValueError(f"Failed to extract ZIP file: {e}") diff --git a/runtime/datamate-python/app/module/operator/repository/__init__.py b/runtime/datamate-python/app/module/operator/repository/__init__.py new file mode 100644 index 00000000..67859d72 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/__init__.py @@ -0,0 +1,15 @@ +""" +Operator Market Repositories +算子市场数据访问层 +""" +from .operator_repository import OperatorRepository +from .category_repository import CategoryRepository +from .category_relation_repository import CategoryRelationRepository +from .operator_release_repository import OperatorReleaseRepository + +__all__ = [ + "OperatorRepository", + "CategoryRepository", + "CategoryRelationRepository", + "OperatorReleaseRepository", +] diff --git a/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py new file mode 100644 index 00000000..1edd5868 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py @@ -0,0 +1,77 @@ +""" +Category Relation Repository +分类关系数据访问层 +""" +from typing import List + +from sqlalchemy import select, delete, and_ +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import CategoryRelation +from app.module.operator.constants import CATEGORY_PREDEFINED_ID + + +class CategoryRelationRepository: + """分类关系数据访问层""" + + def __init__(self, model: CategoryRelation): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[CategoryRelation]: + """查询所有分类关系""" + result = await db.execute(select(self.model)) + return result.scalars().all() + + async def batch_insert( + self, + operator_id: str, + category_ids: List[str], + db: AsyncSession + ) -> None: + """批量插入分类关系""" + for category_id in category_ids: + entity = CategoryRelation( + category_id=category_id, + operator_id=operator_id + ) + db.add(entity) + + async def batch_update( + self, + operator_id: str, + category_ids: List[str], + db: AsyncSession + ) -> None: + """批量更新分类关系(先删除后插入)""" + # Delete existing relations + await db.execute( + delete(self.model) + .where(self.model.operator_id == operator_id) + ) + # Insert new relations + for category_id in category_ids: + entity = CategoryRelation( + category_id=category_id, + operator_id=operator_id + ) + db.add(entity) + + async def delete_by_operator_id(self, operator_id: str, db: AsyncSession) -> None: + """根据算子ID删除分类关系""" + await db.execute( + delete(self.model) + .where(self.model.operator_id == operator_id) + ) + + async def operator_is_predefined(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否为预定义算子""" + result = await db.execute( + select(self.model) + .where( + and_( + self.model.operator_id == operator_id, + self.model.category_id == CATEGORY_PREDEFINED_ID + ) + ) + ) + return result.first() is not None diff --git a/runtime/datamate-python/app/module/operator/repository/category_repository.py b/runtime/datamate-python/app/module/operator/repository/category_repository.py new file mode 100644 index 00000000..b5434d34 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/category_repository.py @@ -0,0 +1,23 @@ +""" +Category Repository +分类数据访问层 +""" +from typing import List + +from sqlalchemy import select +from 
sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import Category +from app.module.operator.schema import CategoryDto + + +class CategoryRepository: + """分类数据访问层""" + + def __init__(self, model: Category): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[Category]: + """查询所有分类""" + result = await db.execute(select(self.model)) + return result.scalars().all() diff --git a/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py new file mode 100644 index 00000000..bcab7be8 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py @@ -0,0 +1,72 @@ +""" +Operator Release Repository +算子发布版本数据访问层 +""" +from typing import List + +from sqlalchemy import select, delete, and_ +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import OperatorRelease +from app.module.operator.schema import OperatorReleaseDto + + +class OperatorReleaseRepository: + """算子发布版本数据访问层""" + + def __init__(self, model: OperatorRelease): + self.model = model + + async def find_all_by_operator_id( + self, + operator_id: str, + db: AsyncSession + ) -> List[OperatorRelease]: + """查询算子的所有发布版本""" + result = await db.execute( + select(OperatorRelease) + .where(OperatorRelease.id == operator_id) + .order_by(OperatorRelease.release_date.desc()) + ) + return result.scalars().all() + + async def insert( + self, + dto: OperatorReleaseDto, + db: AsyncSession + ) -> None: + """插入发布版本""" + entity = OperatorRelease( + id=dto.id, + version=dto.version, + release_date=dto.release_date, + changelog=dto.changelog + ) + db.add(entity) + + async def update( + self, + dto: OperatorReleaseDto, + db: AsyncSession + ) -> None: + """更新发布版本""" + result = await db.execute( + select(OperatorRelease) + .where( + and_( + OperatorRelease.id == dto.id, + OperatorRelease.version == dto.version + ) + ) + ) + entity = result.scalar_one_or_none() + if entity: + entity.changelog = dto.changelog + entity.release_date = dto.release_date + + async def delete(self, operator_id: str, db: AsyncSession) -> None: + """删除算子的所有发布版本""" + await db.execute( + delete(OperatorRelease) + .where(OperatorRelease.id == operator_id) + ) diff --git a/runtime/datamate-python/app/module/operator/repository/operator_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_repository.py new file mode 100644 index 00000000..990f7eb3 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/operator_repository.py @@ -0,0 +1,121 @@ +""" +Operator Repository +算子数据访问层 +""" +import json +from typing import List, Optional +from datetime import datetime, timezone + +from sqlalchemy import select, text, update +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import Operator +from app.module.operator.schema import OperatorDto + + +class OperatorRepository: + """算子数据访问层""" + + def __init__(self, model: Operator): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[Operator]: + """查询所有算子""" + result = await db.execute(select(Operator)) + return result.scalars().all() + + async def insert(self, dto: OperatorDto, db: AsyncSession) -> None: + """插入算子""" + entity = Operator( + id=dto.id, + name=dto.name, + description=dto.description, + version=dto.version, + inputs=dto.inputs, + outputs=dto.outputs, + runtime=dto.runtime, + settings=dto.settings, + 
file_name=dto.file_name, + file_size=dto.file_size, + metrics=dto.metrics, + usage_count=dto.usage_count or 0, + is_star=dto.is_star or False, + ) + db.add(entity) + + async def update(self, dto: OperatorDto, db: AsyncSession) -> None: + """更新算子""" + await db.execute( + update(Operator) + .where(Operator.id == dto.id) + .values( + name=dto.name, + description=dto.description, + version=dto.version, + inputs=dto.inputs, + outputs=dto.outputs, + runtime=dto.runtime, + settings=dto.settings, + file_name=dto.file_name, + file_size=dto.file_size, + metrics=dto.metrics, + is_star=dto.is_star, + updated_at=datetime.utcnow(), + ) + ) + + async def delete(self, operator_id: str, db: AsyncSession) -> None: + """删除算子""" + entity = await db.get(Operator, operator_id) + if entity: + await db.delete(entity) + + async def count_by_star(self, is_star: bool, db: AsyncSession) -> int: + """统计收藏算子数量""" + result = await db.execute( + select(text("COUNT(*)")) + .select_from(Operator) + .where(Operator.is_star == is_star) + ) + return result.scalar() or 0 + + async def operator_in_template(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否在模板中""" + result = await db.execute( + text(""" + SELECT COUNT(*) FROM t_operator_instance oi + JOIN t_clean_template t ON oi.instance_id = t.id + WHERE oi.operator_id = :operator_id + """), + {"operator_id": operator_id} + ) + return (result.scalar() or 0) > 0 + + async def operator_in_unstop_task(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否在未完成的任务中""" + result = await db.execute( + text(""" + SELECT COUNT(*) FROM t_operator_instance oi + JOIN t_clean_task t ON oi.instance_id = t.id + WHERE oi.operator_id = :operator_id AND t.status != 'COMPLETED' + """), + {"operator_id": operator_id} + ) + return (result.scalar() or 0) > 0 + + async def increment_usage_count( + self, + operator_ids: List[str], + db: AsyncSession + ) -> None: + """增加算子使用次数""" + if not operator_ids: + return + await db.execute( + update(Operator) + .where(Operator.id.in_(operator_ids)) + .values( + usage_count=Operator.usage_count + 1, + updated_at=datetime.now(timezone.utc), + ) + ) diff --git a/runtime/datamate-python/app/module/operator/schema/__init__.py b/runtime/datamate-python/app/module/operator/schema/__init__.py new file mode 100644 index 00000000..a084cbaf --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/__init__.py @@ -0,0 +1,29 @@ +""" +Operator Market Schemas +算子市场 Schema 定义 +""" +from .operator import ( + OperatorDto, + OperatorListRequest, + PreUploadResponse, + OperatorUpdateDto, +) +from .category import ( + CategoryDto, + CategoryTreeResponse, + CategoryTreePagedResponse, + CategoryRelationDto, +) +from .release import OperatorReleaseDto + +__all__ = [ + "OperatorDto", + "OperatorListRequest", + "PreUploadResponse", + "CategoryDto", + "CategoryTreeResponse", + "CategoryTreePagedResponse", + "CategoryRelationDto", + "OperatorReleaseDto", + "OperatorUpdateDto", +] diff --git a/runtime/datamate-python/app/module/operator/schema/category.py b/runtime/datamate-python/app/module/operator/schema/category.py new file mode 100644 index 00000000..afd6e3c5 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/category.py @@ -0,0 +1,40 @@ +""" +Category Schemas +分类 Schema 定义 +""" +from typing import List, Optional +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel + + +class CategoryDto(BaseResponseModel): + """分类 DTO""" + id: str = Field(..., 
description="分类ID") + name: str = Field(..., description="分类名称") + value: Optional[str] = Field(None, description="分类值") + type: Optional[str] = Field(None, description="分类类型") + parent_id: Optional[str] = Field(None, description="父分类ID") + count: Optional[int] = Field(0, description="算子数量") + created_at: Optional[datetime] = Field(None, description="创建时间") + + +class CategoryTreeResponse(BaseResponseModel): + """分类树响应""" + id: str = Field(..., description="分类ID") + name: str = Field(..., description="分类名称") + count: int = Field(0, description="算子总数") + categories: List[CategoryDto] = Field(default_factory=list, description="子分类列表") + + +class CategoryTreePagedResponse(BaseResponseModel): + """分类树分页响应""" + star_count: int = Field(0, description="收藏的算子数量") + categories: List[CategoryTreeResponse] = Field(default_factory=list, description="分类树列表") + + +class CategoryRelationDto(BaseResponseModel): + """分类关系 DTO""" + category_id: str = Field(..., description="分类ID") + operator_id: str = Field(..., description="算子ID") diff --git a/runtime/datamate-python/app/module/operator/schema/operator.py b/runtime/datamate-python/app/module/operator/schema/operator.py new file mode 100644 index 00000000..c53ed864 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/operator.py @@ -0,0 +1,72 @@ +""" +Operator Schemas +算子 Schema 定义 +""" +from __future__ import annotations + +from typing import List, Optional, Dict, Any +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel +from .release import OperatorReleaseDto + + +class OperatorDto(BaseResponseModel): + """算子 DTO""" + id: str = Field(..., description="算子ID") + name: str = Field(..., description="算子名称") + description: Optional[str] = Field(None, description="算子描述") + version: str = Field(..., description="算子版本") + inputs: Optional[str] = Field(None, description="输入定义(JSON)") + outputs: Optional[str] = Field(None, description="输出定义(JSON)") + runtime: Optional[str] = Field(None, description="运行时配置(JSON)") + settings: Optional[str] = Field(None, description="算子设置(JSON)") + file_name: Optional[str] = Field(None, description="文件名") + file_size: Optional[int] = Field(None, description="文件大小(字节)") + metrics: Optional[str] = Field(None, description="算子指标(JSON)") + usage_count: Optional[int] = Field(None, description="使用次数") + is_star: Optional[bool] = Field(None, description="是否收藏") + categories: Optional[List[str]] = Field(None, description="分类ID列表") + overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值") + requirements: Optional[List[str]] = Field(None, description="Python 依赖列表") + readme: Optional[str] = Field(None, description="README 内容") + releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表") + created_at: Optional[datetime] = Field(None, description="创建时间") + updated_at: Optional[datetime] = Field(None, description="更新时间") + + +class OperatorListRequest(BaseResponseModel): + """算子列表查询请求""" + page: int = Field(1, ge=0, description="页码(从0开始)") + size: int = Field(10, ge=1, le=100, description="页大小") + categories: List[List[str]] = Field(default_factory=list, description="分类ID列表(每个父分类下的id放到一个列表中)") + keyword: Optional[str] = Field(None, description="搜索关键词") + label_name: Optional[str] = Field(None, description="标签名称(暂不支持)") + is_star: Optional[bool] = Field(None, description="是否收藏") + + +class PreUploadResponse(BaseResponseModel): + """预上传响应""" + req_id: str = Field(..., description="请求ID") + + +class 
OperatorUpdateDto(BaseResponseModel): + """算子更新 DTO(所有字段可选)""" + name: Optional[str] = Field(None, description="算子名称") + description: Optional[str] = Field(None, description="算子描述") + version: Optional[str] = Field(None, description="算子版本") + inputs: Optional[str] = Field(None, description="输入定义(JSON)") + outputs: Optional[str] = Field(None, description="输出定义(JSON)") + runtime: Optional[str] = Field(None, description="运行时配置(JSON)") + settings: Optional[str] = Field(None, description="算子设置(JSON)") + file_name: Optional[str] = Field(None, description="文件名") + file_size: Optional[int] = Field(None, description="文件大小(字节)") + metrics: Optional[str] = Field(None, description="算子指标(JSON)") + usage_count: Optional[int] = Field(None, description="使用次数") + is_star: Optional[bool] = Field(None, description="是否收藏") + categories: Optional[List[str]] = Field(None, description="分类ID列表") + overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值") + requirements: Optional[List[str]] = Field(None, description="Python 依赖列表") + readme: Optional[str] = Field(None, description="README 内容") + releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表") diff --git a/runtime/datamate-python/app/module/operator/schema/release.py b/runtime/datamate-python/app/module/operator/schema/release.py new file mode 100644 index 00000000..f91297ee --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/release.py @@ -0,0 +1,22 @@ +""" +Operator Release Schemas +算子发布版本 Schema 定义 +""" +from __future__ import annotations + +from typing import List, Optional +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel + + +class OperatorReleaseDto(BaseResponseModel): + """算子发布版本 DTO""" + id: str = Field(..., description="算子ID") + version: str = Field(..., description="版本号") + release_date: Optional[datetime] = Field(None, description="发布时间") + changelog: Optional[List[str]] = Field(None, description="更新日志列表") + + +__all__ = ["OperatorReleaseDto"] diff --git a/runtime/datamate-python/app/module/operator/service/__init__.py b/runtime/datamate-python/app/module/operator/service/__init__.py new file mode 100644 index 00000000..3e1c1d0c --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/__init__.py @@ -0,0 +1,11 @@ +""" +Operator Market Services +算子市场服务层 +""" +from .operator_service import OperatorService +from .category_service import CategoryService + +__all__ = [ + "OperatorService", + "CategoryService", +] diff --git a/runtime/datamate-python/app/module/operator/service/category_service.py b/runtime/datamate-python/app/module/operator/service/category_service.py new file mode 100644 index 00000000..47a654b6 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/category_service.py @@ -0,0 +1,101 @@ +""" +Category Service +分类服务层 +""" +from typing import List + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.module.operator.repository import ( + CategoryRepository, + CategoryRelationRepository, +) +from app.module.operator.schema import ( + CategoryDto, + CategoryTreeResponse, + CategoryTreePagedResponse, +) +from app.db.models.operator import Operator +from app.module.operator.repository.operator_repository import OperatorRepository + + +class CategoryService: + """分类服务""" + + def __init__( + self, + category_repo: CategoryRepository, + category_relation_repo: CategoryRelationRepository, + operator_repo: OperatorRepository, + ): + self.category_repo = category_repo + 
self.category_relation_repo = category_relation_repo + self.operator_repo = operator_repo + + async def get_all_categories( + self, + db: AsyncSession + ) -> CategoryTreePagedResponse: + """获取所有分类(树状结构)""" + # Get all categories + all_categories = await self.category_repo.find_all(db) + category_map = {c.id: c for c in all_categories} + + # Get all relations and count operators per category + all_relations = await self.category_relation_repo.find_all(db) + relation_map = {} + for rel in all_relations: + if rel.category_id not in relation_map: + relation_map[rel.category_id] = 0 + relation_map[rel.category_id] += 1 + + # Group by parent_id + grouped_by_parent = {} + for cat in all_categories: + if cat.parent_id != "0": + if cat.parent_id not in grouped_by_parent: + grouped_by_parent[cat.parent_id] = [] + grouped_by_parent[cat.parent_id].append(cat) + + # Build category trees + parent_ids = sorted( + grouped_by_parent.keys(), + key=lambda pid: category_map[pid].created_at or 0 + ) + + category_trees = [] + for parent_id in parent_ids: + group = grouped_by_parent[parent_id] + parent_category = category_map[parent_id] + + # Build DTOs for children + child_dtos = [] + total_count = 0 + for cat in sorted(group, key=lambda c: c.created_at or 0): + cat_dto = CategoryDto( + id=cat.id, + name=cat.name, + value=cat.value, + type=cat.type, + parent_id=cat.parent_id, + count=relation_map.get(cat.id, 0), + created_at=cat.created_at, + ) + child_dtos.append(cat_dto) + total_count += cat_dto.count + + tree = CategoryTreeResponse( + id=parent_id, + name=parent_category.name, + count=total_count, + categories=child_dtos, + ) + category_trees.append(tree) + + # Get star count + star_count = await self.operator_repo.count_by_star(True, db) + + return CategoryTreePagedResponse( + star_count=star_count, + categories=category_trees, + ) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py new file mode 100644 index 00000000..17127e58 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -0,0 +1,599 @@ +""" +Operator Service +算子服务层 +""" +import json +import os +import uuid +import shutil +from pathlib import Path +from typing import List, Optional, Dict, Any, TYPE_CHECKING + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, text, func + +from app.core.logging import get_logger +from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, +) +from app.module.operator.schema import ( + OperatorDto, + OperatorUpdateDto, + OperatorReleaseDto, +) +from app.module.operator.parsers import ParserHolder +from app.module.operator.constants import ( + OPERATOR_BASE_PATH, + UPLOAD_DIR, + EXTRACT_DIR, + YAML_PATH, + SERVICE_ID, +) +from app.module.operator.exceptions import ( + SettingsParseError, + OperatorInInstanceError, + CannotDeletePredefinedOperatorError, +) +from app.module.shared.file_service import FileService +from app.module.shared.file_models import ( + ChunkUploadRequestDto, + FileUploadResult, +) + +logger = get_logger(__name__) + + +class OperatorService: + """算子服务""" + + def __init__( + self, + operator_repo: OperatorRepository, + category_relation_repo: CategoryRelationRepository, + operator_release_repo: OperatorReleaseRepository, + parser_holder: ParserHolder, + file_service: FileService, + ): + self.operator_repo = operator_repo + self.category_relation_repo = 
category_relation_repo + self.operator_release_repo = operator_release_repo + self.parser_holder = parser_holder + self.file_service = file_service + + async def get_operators( + self, + page: int, + size: int, + categories: List[List[str]], + keyword: Optional[str], + is_star: Optional[bool], + db: AsyncSession + ) -> List[OperatorDto]: + """查询算子列表(分页)""" + offset = page * size + + # Build query with categories filter + conditions = [] + params = {"limit": size, "offset": offset} + + if is_star is not None: + conditions.append("ov.is_star = :is_star") + params["is_star"] = is_star + + if keyword: + conditions.append( + "(ov.operator_name ILIKE :keyword OR ov.description ILIKE :keyword)" + ) + params["keyword"] = f"%{keyword}%" + + where_clause = "" + if conditions: + where_clause = "WHERE " + " AND ".join(conditions) + + # Handle categories grouping + group_by = "GROUP BY ov.operator_id, ov.operator_name, ov.description, ov.version, " \ + "ov.inputs, ov.outputs, ov.runtime, ov.settings, ov.is_star, " \ + "ov.file_size, ov.usage_count, ov.created_at, ov.updated_at, ov.created_by, ov.updated_by" + + having_clause = "" + if categories: + # Flatten all category IDs for IN clause + all_category_ids = [cat_id for sublist in categories for cat_id in sublist] + if all_category_ids: + where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)" + params["category_ids"] = all_category_ids + + # Build HAVING clause for category groups + having_clauses = [] + for i, cat_group in enumerate(categories): + cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group]) + having_clauses.append( + f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0" + ) + having_clause = "HAVING " + " AND ".join(having_clauses) + + query = f""" + SELECT + ov.operator_id AS id, + ov.operator_name AS name, + ov.description, + ov.version, + ov.inputs, + ov.outputs, + ov.runtime, + ov.settings, + ov.is_star, + ov.file_size, + ov.usage_count, + ov.created_at, + ov.updated_at + FROM v_operator ov + {where_clause} + {group_by} + {having_clause} + ORDER BY ov.created_at DESC + LIMIT :limit OFFSET :offset + """ + + result = await db.execute(text(query), params) + rows = result.fetchall() + + # Convert to DTOs + operators = [] + for row in rows: + operators.append(OperatorDto( + id=row.id, + name=row.name, + description=row.description, + version=row.version, + inputs=row.inputs, + outputs=row.outputs, + runtime=row.runtime, + settings=row.settings, + file_name=None, + file_size=row.file_size, + metrics=None, + usage_count=row.usage_count, + is_star=row.is_star, + created_at=row.created_at, + updated_at=row.updated_at, + )) + + return operators + + async def count_operators( + self, + categories: List[List[str]], + keyword: Optional[str], + is_star: Optional[bool], + db: AsyncSession + ) -> int: + """统计算子数量""" + conditions = [] + params = {} + + if is_star is not None: + conditions.append("is_star = :is_star") + params["is_star"] = is_star + + if keyword: + conditions.append( + "(operator_name ILIKE :keyword OR description ILIKE :keyword)" + ) + params["keyword"] = f"%{keyword}%" + + where_clause = "" + if conditions: + where_clause = "WHERE " + " AND ".join(conditions) + + # Handle categories grouping + group_by = "GROUP BY operator_id, operator_name, description, version, inputs, outputs, " \ + "runtime, settings, is_star, file_size, usage_count, created_at, updated_at, " \ + "created_by, updated_by" + + having_clause = "" + if categories: + # Flatten all 
category IDs for IN clause + all_category_ids = [cat_id for sublist in categories for cat_id in sublist] + if all_category_ids: + where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)" + params["category_ids"] = all_category_ids + + # Build HAVING clause for category groups + having_clauses = [] + for i, cat_group in enumerate(categories): + cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group]) + having_clauses.append( + f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0" + ) + having_clause = "HAVING " + " AND ".join(having_clauses) + + query = f""" + SELECT COUNT(*) as count + FROM ( + SELECT operator_id + FROM v_operator + {where_clause} + {group_by} + {having_clause} + ) AS t + """ + + result = await db.execute(text(query), params) + return result.scalar() or 0 + + async def get_operator_by_id( + self, + operator_id: str, + db: AsyncSession + ) -> OperatorDto: + """根据 ID 获取算子详情""" + result = await db.execute( + text("SELECT * FROM v_operator WHERE operator_id = :operator_id"), + {"operator_id": operator_id} + ) + row = result.fetchone() + + if not row: + raise ValueError(f"Operator {operator_id} not found") + + # Build DTO + operator = OperatorDto( + id=row.operator_id, + name=row.operator_name, + description=row.description, + version=row.version, + inputs=row.inputs, + outputs=row.outputs, + runtime=row.runtime, + settings=row.settings, + file_name=row.file_name, + file_size=row.file_size, + metrics=row.metrics, + usage_count=row.usage_count, + is_star=row.is_star, + created_at=row.created_at, + updated_at=row.updated_at, + ) + + # Read requirements and readme if file exists + if row.file_name: + extract_path = self._get_extract_path( + self._get_stem(row.file_name) + ) + operator.requirements = self._read_requirements(extract_path) + operator.readme = self._get_readme_content(extract_path) + + operator.file_name = None # Don't return file_name + + # Load releases + releases = await self.operator_release_repo.find_all_by_operator_id( + operator_id, db + ) + operator.releases = [ + OperatorReleaseDto( + id=release.id, + version=release.version, + release_date=release.release_date, + changelog=release.changelog + ) + for release in releases + ] + + return operator + + async def create_operator( + self, + req: OperatorDto, + db: AsyncSession + ) -> OperatorDto: + """创建算子""" + from datetime import datetime, timezone + + # Generate ID if not provided + if not req.id: + req.id = str(uuid.uuid4()) + + # Override settings + self._override_settings(req) + + # Insert operator + await self.operator_repo.insert(req, db) + + # Insert category relations + if req.categories: + await self.category_relation_repo.batch_insert( + req.id, req.categories, db + ) + + # Insert release + if req.releases: + release = req.releases[0] + release.id = req.id + release.version = req.version + release.release_date = datetime.now(timezone.utc) + await self.operator_release_repo.insert(release, db) + + # Extract files + if req.file_name: + self.parser_holder.extract_to( + self._get_file_type(req.file_name), + self._get_upload_path(req.file_name), + self._get_extract_path(self._get_stem(req.file_name)) + ) + + await db.flush() + return await self.get_operator_by_id(req.id, db) + + async def update_operator( + self, + operator_id: str, + req: OperatorUpdateDto, + db: AsyncSession + ) -> OperatorDto: + """更新算子""" + from datetime import datetime, timezone + + # Get existing operator + existing = await 
self.get_operator_by_id(operator_id, db) + + # Merge update request into existing operator + # Only update fields that are provided (not None) + if req.name is not None: + existing.name = req.name + if req.description is not None: + existing.description = req.description + if req.version is not None: + existing.version = req.version + if req.inputs is not None: + existing.inputs = req.inputs + if req.outputs is not None: + existing.outputs = req.outputs + if req.runtime is not None: + existing.runtime = req.runtime + if req.settings is not None: + existing.settings = req.settings + if req.file_name is not None: + existing.file_name = req.file_name + if req.file_size is not None: + existing.file_size = req.file_size + if req.metrics is not None: + existing.metrics = req.metrics + if req.usage_count is not None: + existing.usage_count = req.usage_count + if req.is_star is not None: + existing.is_star = req.is_star + if req.categories is not None: + existing.categories = req.categories + if req.overrides is not None: + existing.overrides = req.overrides + + # Override settings + self._override_settings(existing) + + # Update operator + await self.operator_repo.update(existing, db) + + # Update category relations + if req.categories is not None: + await self.category_relation_repo.batch_update( + operator_id, req.categories, db + ) + + # Update release + logger.info(f"########### {req.releases}") + if req.releases is not None and len(req.releases) > 0: + release = req.releases[0] + if release.version is None: + release.version = existing.version + release.id = operator_id + release.release_date = datetime.now(timezone.utc) + if existing.version == release.version: + await self.operator_release_repo.update(release, db) + else: + await self.operator_release_repo.insert(release, db) + + # Extract files + if req.file_name is not None: + self.parser_holder.extract_to( + self._get_file_type(req.file_name), + self._get_upload_path(req.file_name), + self._get_extract_path(self._get_stem(req.file_name)) + ) + + await db.flush() + return await self.get_operator_by_id(operator_id, db) + + async def delete_operator( + self, + operator_id: str, + db: AsyncSession + ) -> None: + """删除算子""" + # Check if operator is in use + in_template = await self.operator_repo.operator_in_template(operator_id, db) + in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db) + if in_template and in_unstop_task: + raise OperatorInInstanceError() + + # Check if operator is predefined + is_predefined = await self.category_relation_repo.operator_is_predefined( + operator_id, db + ) + if is_predefined: + raise CannotDeletePredefinedOperatorError() + + # Get operator for file cleanup + operator = await self.get_operator_by_id(operator_id, db) + + # Delete from database + await self.operator_repo.delete(operator_id, db) + await self.category_relation_repo.delete_by_operator_id(operator_id, db) + await self.operator_release_repo.delete(operator_id, db) + + # Delete extracted files + if operator.file_name: + extract_path = self._get_extract_path(self._get_stem(operator.file_name)) + shutil.rmtree(extract_path, ignore_errors=True) + + async def upload_operator( + self, + file_name: str, + db: AsyncSession + ) -> OperatorDto: + """上传算子文件并解析元数据""" + return self.parser_holder.parse_yaml_from_archive( + self._get_file_type(file_name), + self._get_upload_path(file_name), + YAML_PATH + ) + + async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: + """预上传,返回请求 ID""" + from app.module.operator.constants 
import OPERATOR_BASE_PATH, UPLOAD_DIR + + upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR) + req_id = await self.file_service.pre_upload( + upload_path=upload_path, + service_id=SERVICE_ID, + check_info=None + ) + return {"req_id": req_id} + + async def chunk_upload( + self, + req_id: str, + file_no: int, + file_name: str, + total_chunk_num: int, + chunk_no: int, + check_sum_hex: Optional[str], + file_content: bytes, + db: AsyncSession + ) -> FileUploadResult: + """分块上传文件""" + from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR + + upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR) + + chunk_request = ChunkUploadRequestDto( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + check_sum_hex=check_sum_hex, + ) + + return await self.file_service.chunk_upload( + chunk_request, upload_path, file_content, db + ) + + def download_example_operator(self, file_path: str) -> Path: + """下载示例算子文件""" + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + return path + + def _override_settings(self, operator: OperatorDto) -> None: + """用 overrides 值覆盖 settings 的 defaultVal""" + if not operator.settings or not operator.overrides: + return + + try: + settings = json.loads(operator.settings) + for key, value in operator.overrides.items(): + if key not in settings: + continue + + setting = settings[key] + setting_type = setting.get("type") + + match setting_type: + case "slider" | "switch" | "select" | "input" | "radio": + setting["defaultVal"] = value + case "checkbox": + setting["defaultVal"] = self._convert_to_list_string(value) + case "range": + self._update_properties(setting, value) + + settings[key] = setting + + operator.settings = json.dumps(settings) + except json.JSONDecodeError as e: + raise SettingsParseError(str(e)) + + def _convert_to_list_string(self, value: Any) -> str: + """转换为逗号分隔的字符串""" + if value is None: + return "" + if isinstance(value, list): + return ",".join(str(v) for v in value) + return str(value) + + def _update_properties(self, setting: Dict[str, Any], value: Any) -> None: + """更新 range 类型的 properties""" + if not isinstance(value, list): + return + + properties = setting.get("properties", []) + if not isinstance(properties, list) or len(properties) != len(value): + return + + for i, prop in enumerate(properties): + if isinstance(prop, dict): + prop["defaultVal"] = value[i] + + setting["properties"] = properties + + def _read_requirements(self, extract_path: str) -> List[str]: + """读取 requirements.txt""" + requirements_path = Path(extract_path) / "requirements.txt" + if not requirements_path.exists(): + return [] + + requirements = [] + try: + with open(requirements_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + requirements.append(line) + except Exception as e: + logger.warning(f"Failed to read requirements: {e}") + return requirements + + def _get_readme_content(self, extract_path: str) -> str: + """读取 README 内容""" + dir_path = Path(extract_path) + if not dir_path.exists(): + return "" + + candidates = ["README.md", "readme.md", "Readme.md"] + for filename in candidates: + readme_path = dir_path / filename + if readme_path.exists(): + try: + return readme_path.read_text(encoding='utf-8') + except Exception as e: + logger.warning(f"Failed to read README: {e}") + return "" + + def _get_file_type(self, file_name: str) -> str: + """获取文件类型(扩展名)""" + return 
file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else "" + + def _get_stem(self, file_name: str) -> str: + """获取文件名不含扩展名""" + return file_name.rsplit('.', 1)[0] if '.' in file_name else file_name + + def _get_upload_path(self, file_name: str) -> str: + """获取上传文件路径""" + return os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR, file_name) + + def _get_extract_path(self, file_stem: str) -> str: + """获取解压路径""" + return os.path.join(OPERATOR_BASE_PATH, EXTRACT_DIR, file_stem) diff --git a/runtime/datamate-python/app/module/shared/__init__.py b/runtime/datamate-python/app/module/shared/__init__.py index e69de29b..fd0d7a1a 100644 --- a/runtime/datamate-python/app/module/shared/__init__.py +++ b/runtime/datamate-python/app/module/shared/__init__.py @@ -0,0 +1,21 @@ +""" +Shared Module Init +共享模块初始化 +""" +from .file_service import FileService +from .file_models import ( + ChunkUploadPreRequestDto, + ChunkUploadRequestDto, + FileUploadResult, +) +from .chunks_saver import ChunksSaver +from .chunk_upload_repository import ChunkUploadRepository + +__all__ = [ + "FileService", + "ChunkUploadPreRequestDto", + "ChunkUploadRequestDto", + "FileUploadResult", + "ChunksSaver", + "ChunkUploadRepository", +] diff --git a/runtime/datamate-python/app/module/shared/chunk_upload_repository.py b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py new file mode 100644 index 00000000..8a0c717d --- /dev/null +++ b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py @@ -0,0 +1,95 @@ +""" +Chunk Upload Repository +分片上传数据访问层 +""" +from typing import Optional, List + +from sqlalchemy import select, update, delete +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.chunk_upload import ChunkUploadPreRequest +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +class ChunkUploadRepository: + """分片上传数据访问层""" + + async def find_by_id( + self, + req_id: str, + db: AsyncSession + ) -> Optional[ChunkUploadPreRequest]: + """根据ID查询""" + result = await db.execute( + select(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id) + ) + return result.scalar_one_or_none() + + async def find_by_service_id( + self, + service_id: str, + db: AsyncSession + ) -> List[ChunkUploadPreRequest]: + """根据服务ID查询""" + result = await db.execute( + select(ChunkUploadPreRequest).where( + ChunkUploadPreRequest.service_id == service_id + ) + ) + return result.scalars().all() + + async def find_all(self, db: AsyncSession) -> List[ChunkUploadPreRequest]: + """查询所有""" + result = await db.execute(select(ChunkUploadPreRequest)) + return result.scalars().all() + + async def insert( + self, + request: ChunkUploadPreRequest, + db: AsyncSession + ) -> None: + """插入""" + db.add(request) + + async def update( + self, + request: ChunkUploadPreRequest, + db: AsyncSession + ) -> int: + """更新""" + from datetime import datetime, timezone + result = await db.execute( + update(ChunkUploadPreRequest) + .where(ChunkUploadPreRequest.id == request.id) + .values( + uploaded_file_num=request.uploaded_file_num, + timeout=request.timeout, + ) + ) + return result.rowcount + + async def delete_by_id( + self, + req_id: str, + db: AsyncSession + ) -> int: + """根据ID删除""" + result = await db.execute( + delete(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id) + ) + return result.rowcount + + async def delete_by_service_id( + self, + service_id: str, + db: AsyncSession + ) -> int: + """根据服务ID删除""" + result = await db.execute( + delete(ChunkUploadPreRequest).where( + 
ChunkUploadPreRequest.service_id == service_id + ) + ) + return result.rowcount diff --git a/runtime/datamate-python/app/module/shared/chunks_saver.py b/runtime/datamate-python/app/module/shared/chunks_saver.py new file mode 100644 index 00000000..554b263b --- /dev/null +++ b/runtime/datamate-python/app/module/shared/chunks_saver.py @@ -0,0 +1,146 @@ +""" +Chunks Saver +分片保存器,用于处理文件分片上传 +""" +import os +from pathlib import Path +from typing import Optional +from datetime import datetime, timezone + +from fastapi import UploadFile + +from app.core.logging import get_logger +from app.module.shared.file_models import ChunkUploadRequestDto + +logger = get_logger(__name__) + + +class ChunksSaver: + """分片保存器""" + + TEMP_DIR_NAME_FORMAT = "req_%s_chunks" + + @staticmethod + def save( + file_upload_request: ChunkUploadRequestDto, + pre_upload_req_id: str, + upload_path: str, + file_content: bytes + ) -> Optional[Path]: + """ + 保存分片 + + Args: + file_upload_request: 上传分片的请求 + pre_upload_req_id: 预上传请求ID + upload_path: 上传基础路径 + file_content: 文件内容(字节) + + Returns: + 保存后的文件路径,如果不是最后一个分片则返回None + """ + start_time = datetime.now(timezone.utc) + + temp_dir = Path(upload_path) / ( + ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_upload_req_id + ) + temp_dir.mkdir(parents=True, exist_ok=True) + + temp_file = temp_dir / str(file_upload_request.file_no) + + ChunksSaver._append_to_target_file(temp_file, file_content) + + if file_upload_request.total_chunk_num != file_upload_request.chunk_no: + elapsed = (datetime.now(timezone.utc) - start_time).total_seconds() + logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s") + return None + + final_file = Path(upload_path) / file_upload_request.file_name + + try: + temp_file.rename(final_file) + except OSError as e: + logger.error( + f"failed to mv file: {temp_file.name}, req id: {pre_upload_req_id}, error: {e}" + ) + raise ValueError("failed to move file to target dir") from e + + elapsed = (datetime.now(timezone.utc) - start_time).total_seconds() + logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s") + + return final_file + + @staticmethod + def save_file( + file_upload_request: ChunkUploadRequestDto, + upload_path: str, + file_content: bytes + ) -> Path: + """ + 保存文件(不分片) + + Args: + file_upload_request: 上传请求 + upload_path: 上传路径 + file_content: 文件内容(字节) + + Returns: + 保存后的文件路径 + """ + target_file = Path(upload_path) / file_upload_request.file_name + + logger.info(f"file path {target_file}, file size {len(file_content)}") + + try: + target_file.parent.mkdir(parents=True, exist_ok=True) + target_file.write_bytes(file_content) + except OSError as e: + logger.error(f"failed to save file: {target_file}, error: {e}") + raise ValueError("failed to save file") from e + + return target_file + + @staticmethod + def delete_folder(folder_path: str) -> None: + """ + 删除指定路径下的所有文件 + + Args: + folder_path: 文件夹路径 + """ + folder = Path(folder_path) + + if not folder.exists(): + logger.info(f"folder {folder_path} does not exist") + return + + try: + for item in folder.glob("*"): + if item.is_file(): + item.unlink() + elif item.is_dir(): + for sub_item in item.glob("*"): + if sub_item.is_file(): + sub_item.unlink() + elif sub_item.is_dir(): + ChunksSaver.delete_folder(str(sub_item)) + item.rmdir() + except OSError as e: + logger.error(f"failed to delete folder: {folder_path}, error: {e}") + raise ValueError("failed to delete folder") from e + + @staticmethod + def _append_to_target_file(target_file: Path, content: bytes) -> None: + """ + 追加内容到目标文件末尾 + + 
Args: + target_file: 目标文件 + content: 要追加的内容 + """ + try: + with open(target_file, "ab") as f: + f.write(content) + except OSError as e: + logger.error(f"failed to append to file: {target_file}, error: {e}") + raise ValueError("failed to append content to file") from e diff --git a/runtime/datamate-python/app/module/shared/file_models.py b/runtime/datamate-python/app/module/shared/file_models.py new file mode 100644 index 00000000..c4e98775 --- /dev/null +++ b/runtime/datamate-python/app/module/shared/file_models.py @@ -0,0 +1,38 @@ +""" +File Models +文件相关模型定义 +""" +from pathlib import Path +from typing import Optional +from pydantic import BaseModel, Field +from datetime import datetime + + +class ChunkUploadPreRequestDto(BaseModel): + """分片上传预请求DTO""" + id: str = Field(..., description="请求ID") + total_file_num: int = Field(..., description="总文件数", ge=1) + uploaded_file_num: Optional[int] = Field(None, description="已上传文件数", ge=0) + upload_path: str = Field(..., description="文件路径") + timeout: Optional[datetime] = Field(None, description="上传请求超时时间") + service_id: Optional[str] = Field(None, description="上传请求所属服务ID") + check_info: Optional[str] = Field(None, description="业务信息") + + +class ChunkUploadRequestDto(BaseModel): + """分片上传请求DTO""" + req_id: str = Field(..., description="预上传返回的ID") + file_no: int = Field(1, description="文件编号", ge=1) + file_name: str = Field(..., description="文件名称") + total_chunk_num: int = Field(1, description="总分块数量", ge=1) + chunk_no: int = Field(1, description="当前分块编号", ge=1) + file_size: Optional[int] = Field(None, description="文件大小", ge=0) + check_sum_hex: Optional[str] = Field(None, description="文件校验和(十六进制字符串)") + + +class FileUploadResult(BaseModel): + """文件上传结果""" + is_all_files_uploaded: bool = Field(..., description="是否所有文件已上传") + check_info: Optional[str] = Field(None, description="业务上传信息") + saved_file_path: Optional[str] = Field(None, description="保存的文件路径") + file_name: str = Field(..., description="文件名称") diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py new file mode 100644 index 00000000..1c859c85 --- /dev/null +++ b/runtime/datamate-python/app/module/shared/file_service.py @@ -0,0 +1,187 @@ +""" +File Service +文件服务,处理文件上传、分片上传等功能 +""" +import os +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +from fastapi import UploadFile + +from app.core.logging import get_logger +from app.module.shared.file_models import ( + ChunkUploadPreRequestDto, + ChunkUploadRequestDto, + FileUploadResult, +) +from app.module.shared.chunks_saver import ChunksSaver +from app.module.shared.chunk_upload_repository import ChunkUploadRepository +from app.db.models.chunk_upload import ChunkUploadPreRequest + +logger = get_logger(__name__) + + +class FileService: + """文件服务""" + + DEFAULT_TIMEOUT_SECONDS = 120 + + def __init__( + self, + chunk_upload_repo: ChunkUploadRepository, + ): + self.chunk_upload_repo = chunk_upload_repo + + async def pre_upload( + self, + upload_path: str, + service_id: str, + check_info: Optional[str] = None + ) -> str: + """ + 预上传 + + Args: + upload_path: 上传路径 + service_id: 服务ID + check_info: 业务信息 + + Returns: + 预上传请求ID + """ + req_id = str(uuid.uuid4()) + timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + + pre_request = ChunkUploadPreRequest( + id=req_id, + total_file_num=1, + uploaded_file_num=0, + upload_path=upload_path, + 
timeout=timeout, + service_id=service_id, + check_info=check_info, + ) + + await self.chunk_upload_repo.insert(pre_request) + return req_id + + async def chunk_upload( + self, + upload_request: ChunkUploadRequestDto, + upload_path: str, + file_content: bytes, + db_session, + ) -> FileUploadResult: + """ + 分片上传 + + Args: + upload_request: 上传请求 + upload_path: 上传路径 + file_content: 文件内容 + db_session: 数据库会话 + + Returns: + 上传结果 + """ + upload_request.file_size = len(file_content) + + pre_request = await self.chunk_upload_repo.find_by_id( + upload_request.req_id, db_session + ) + + if pre_request is None: + logger.error(f"pre-upload request not found: {upload_request.req_id}") + raise ValueError("Pre-upload request not found") + + if pre_request.is_upload_complete(): + logger.error(f"upload already complete: {upload_request.req_id}") + raise ValueError("Upload already complete") + + if pre_request.is_request_timeout(): + logger.error(f"upload request timeout: {upload_request.req_id}") + raise ValueError("Upload request timeout") + + saved_file_path = None + + if upload_request.total_chunk_num > 1: + saved_file_path = await self._upload_chunk( + upload_request, pre_request, upload_path, file_content + ) + else: + saved_file_path = await self._upload_file( + upload_request, pre_request, upload_path, file_content + ) + + update_count = await self.chunk_upload_repo.update(pre_request, db_session) + + if update_count == 0: + logger.error(f"failed to update pre-request: {upload_request.req_id}") + raise ValueError("Failed to update pre-upload request") + + is_finish = pre_request.uploaded_file_num == pre_request.total_file_num + + if is_finish: + temp_dir = os.path.join( + upload_path, + ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_request.id + ) + try: + ChunksSaver.delete_folder(temp_dir) + except Exception as e: + logger.warning(f"failed to delete temp dir: {temp_dir}, error: {e}") + + await self.chunk_upload_repo.delete_by_id(pre_request.id, db_session) + + return FileUploadResult( + is_all_files_uploaded=is_finish, + check_info=pre_request.check_info, + saved_file_path=str(saved_file_path) if saved_file_path else None, + file_name=upload_request.file_name, + ) + + async def _upload_file( + self, + upload_request: ChunkUploadRequestDto, + pre_request: ChunkUploadPreRequest, + upload_path: str, + file_content: bytes + ) -> Path: + """上传单文件""" + saved_file = ChunksSaver.save_file( + upload_request, upload_path, file_content + ) + + from datetime import timezone + pre_request.timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + pre_request.increment_uploaded_file_num() + + return saved_file + + async def _upload_chunk( + self, + upload_request: ChunkUploadRequestDto, + pre_request: ChunkUploadPreRequest, + upload_path: str, + file_content: bytes + ) -> Optional[Path]: + """上传分片""" + saved_file = ChunksSaver.save( + upload_request, pre_request.id, upload_path, file_content + ) + + if saved_file is not None: + pre_request.increment_uploaded_file_num() + return saved_file + + from datetime import timezone + pre_request.timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + return None diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index 0587b841..c85380a0 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -49,6 +49,10 @@ CREATE TABLE IF NOT EXISTS t_operator_release version VARCHAR(255), 
release_date TIMESTAMP, changelog JSON, + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (id, version) ); @@ -60,7 +64,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category value VARCHAR(64) UNIQUE, type VARCHAR(64), parent_id VARCHAR(64), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); COMMENT ON TABLE t_operator_category IS '算子分类表'; @@ -76,6 +83,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category_relation ( category_id VARCHAR(64), operator_id VARCHAR(64), + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (category_id, operator_id) ); diff --git a/scripts/images/backend-python/Dockerfile b/scripts/images/backend-python/Dockerfile index 4d276dd0..826a0531 100644 --- a/scripts/images/backend-python/Dockerfile +++ b/scripts/images/backend-python/Dockerfile @@ -55,6 +55,7 @@ ENV NLTK_DATA=/usr/local/nltk_data # Copy the rest of the application COPY runtime/datamate-python /app +COPY runtime/ops/examples/test_operator/test_operator.tar /app/test_operator.tar COPY --from=datax-builder /DataX/target/datax/datax /opt/datax RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/ From 09b8564f39362871b75105f162b0cc3c503edd2e Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 4 Feb 2026 10:11:50 +0800 Subject: [PATCH 03/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/module/operator/service/operator_service.py | 1 + .../app/module/shared/file_service.py | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 17127e58..094c49f5 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -458,6 +458,7 @@ async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: req_id = await self.file_service.pre_upload( upload_path=upload_path, service_id=SERVICE_ID, + db_session=db, check_info=None ) return {"req_id": req_id} diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py index 1c859c85..e51db024 100644 --- a/runtime/datamate-python/app/module/shared/file_service.py +++ b/runtime/datamate-python/app/module/shared/file_service.py @@ -4,7 +4,7 @@ """ import os import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Optional @@ -38,6 +38,7 @@ async def pre_upload( self, upload_path: str, service_id: str, + db_session, check_info: Optional[str] = None ) -> str: """ @@ -54,7 +55,7 @@ async def pre_upload( req_id = str(uuid.uuid4()) timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) pre_request = ChunkUploadPreRequest( id=req_id, @@ -66,7 +67,7 @@ 
async def pre_upload( check_info=check_info, ) - await self.chunk_upload_repo.insert(pre_request) + await self.chunk_upload_repo.insert(pre_request, db_session) return req_id async def chunk_upload( @@ -156,10 +157,9 @@ async def _upload_file( upload_request, upload_path, file_content ) - from datetime import timezone pre_request.timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) pre_request.increment_uploaded_file_num() return saved_file @@ -180,8 +180,7 @@ async def _upload_chunk( pre_request.increment_uploaded_file_num() return saved_file - from datetime import timezone pre_request.timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) return None From 492cd00cc507f05a89b18deef2439e01578d56f1 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 4 Feb 2026 15:11:54 +0800 Subject: [PATCH 04/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deployment/docker/datamate/docker-compose.yml | 2 + .../Detail/components/ChangeLog.tsx | 2 +- .../pages/OperatorMarket/operator.const.tsx | 6 +- .../app/db/models/chunk_upload.py | 8 +-- .../operator/interface/operator_routes.py | 67 ++++++++++--------- .../operator/parsers/abstract_parser.py | 31 +++++++-- .../module/operator/parsers/parser_holder.py | 13 +++- .../app/module/operator/parsers/tar_parser.py | 10 ++- .../app/module/operator/parsers/zip_parser.py | 11 ++- .../category_relation_repository.py | 16 ++--- .../app/module/operator/schema/operator.py | 2 +- .../operator/service/operator_service.py | 65 ++++++++++++------ .../app/module/shared/file_service.py | 17 ++--- scripts/db/data-operator-init.sql | 3 - 14 files changed, 157 insertions(+), 96 deletions(-) diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml index 9f3ec006..eb49a0ce 100644 --- a/deployment/docker/datamate/docker-compose.yml +++ b/deployment/docker/datamate/docker-compose.yml @@ -35,6 +35,8 @@ services: - flow_volume:/flow - log_volume:/var/log/datamate - graph_data_volume:/data/rag_storage + - operator-upload-volume:/operators/upload + - operator-runtime-volume:/operators/extract networks: [ datamate ] depends_on: - datamate-database diff --git a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx index c9e77ad0..cf3f100c 100644 --- a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx +++ b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx @@ -20,7 +20,7 @@ export default function ChangeLog({ operator }) { )}
[GIT binary patch data omitted: base85-encoded payload (literal 5077), not recoverable as text]
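For context, the two-step upload flow added in these patches (a pre-upload call that returns a request id, followed by chunk-by-chunk uploads that FileService/ChunksSaver reassemble on the server) can be driven by a client roughly as sketched below. This is a minimal sketch only: the endpoint paths, form field names, response shape, and the use of httpx are assumptions for illustration, not taken verbatim from the patches.

# Hypothetical client for the pre-upload + chunk-upload flow (paths and field names assumed).
import math
from pathlib import Path

import httpx  # any HTTP client would do; httpx is used here for brevity

CHUNK_SIZE = 5 * 1024 * 1024  # arbitrary 5 MiB chunks


def upload_operator_archive(base_url: str, archive: Path) -> dict:
    data = archive.read_bytes()
    total_chunks = max(1, math.ceil(len(data) / CHUNK_SIZE))

    with httpx.Client(base_url=base_url) as client:
        # Step 1: pre-upload registers the request and returns a correlation id.
        pre = client.post("/api/operators/pre-upload").json()
        req_id = pre["data"]["req_id"]  # response shape assumed

        # Step 2: send chunks in order; the server appends each chunk to a temp
        # file and renames it into place when chunk_no == total_chunk_num.
        result = None
        for chunk_no in range(1, total_chunks + 1):
            chunk = data[(chunk_no - 1) * CHUNK_SIZE : chunk_no * CHUNK_SIZE]
            result = client.post(
                "/api/operators/chunk-upload",
                data={
                    "req_id": req_id,
                    "file_no": 1,
                    "file_name": archive.name,
                    "total_chunk_num": total_chunks,
                    "chunk_no": chunk_no,
                },
                files={"file": (archive.name, chunk)},
            ).json()
    return result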
zw}8)L{jp{&W(C^_l(@}e0ow@FUvQ~Jzag=S+YHv8qWt0}P2aV3_JRM7A=$+s{H6$= zAKF0Np1ns76nEa(O^lKkQ)>45YW*(#G73o2XSrx%xmF_Sw<{R9^}YfhQdh&wrQCyY zR5FJsXjxko5rV5C>t0;=ub0GF%lvVMX4xkeeeyqz8hJ)xaY?#aj>wDJH%6hg?+OZy z+VoP@=@OQQ*vmCns~_i|)+iHaC5}e1Q}J|o4fqD9W9e(3j#0k7N&)w(6Dq1eU84qd zl`7OV>QGBmqVfVOYcadQ2P}%$rn)x93S0LT!o}M5-#$Bv%gGIHU}g2jSRwkZ#tNgf zd}VQY$?|f8+SN >ew^5XIhYoo2ZT?E$p z@pUT5e=B-fTnTRKB`-1DFkful?R+r`!);7qSqeF9ezfu9ZD(w6wmBQj;+d@)<2BCz zJvHV1|Ltkz^Z)m{nfrfR=Lgocf3g2>J^y#P%lm)Z(LDP~p8xx6yncWA`2XE+TOa+` zwSSM-U1N{`poiu7za4Gut8bgQ`z9{GnOl5YeeG@YSKh4mhQxpB&_8Be&BrD#H8IFK z`%6Sv8ejQ3-8*!hu6&j5H-43_^cA|d?Mwd}o3p`wDeKLTvHA6Xjn7q%|J&2b>;Lz< znd|?p^8@SJ|BCv5zsFNsUjJ`LD_#Fz Hofzu*0~_0fNQ`*(Tm_5U> Date: Tue, 10 Feb 2026 14:29:27 +0800 Subject: [PATCH 06/20] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86pytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../interface/cleaning_task_routes.py | 20 +++++---- .../interface/cleaning_template_routes.py | 10 ++--- .../cleaning/service/cleaning_task_service.py | 42 +++++++++++++++++++ .../operator/interface/category_routes.py | 2 +- .../operator/interface/operator_routes.py | 16 +++---- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py index dae2a96c..dc233f77 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -53,6 +53,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: ) from app.module.cleaning.runtime_client import RuntimeClient from app.module.dataset.service import DatasetManagementService + from app.module.shared.common.lineage import LineageService runtime_client = RuntimeClient() scheduler = CleaningTaskScheduler( @@ -61,6 +62,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: ) operator_service = _get_operator_service() dataset_service = DatasetManagementService(db) + lineage_service = LineageService(db) return CleaningTaskService( task_repo=CleaningTaskRepository(None), @@ -70,6 +72,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: scheduler=scheduler, validator=CleanTaskValidator(), dataset_service=dataset_service, + lineage_service=lineage_service, ) @@ -95,7 +98,7 @@ async def get_cleaning_tasks( total_pages = (count + size - 1) // size if size > 0 else 0 return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( page=page, @@ -128,7 +131,7 @@ async def create_cleaning_task( await task_service.execute_task(db, task.id) await db.commit() - return StandardResponse(code=200, message="success", data=task) + return StandardResponse(code="0", message="success", data=task) except Exception as e: await db.rollback() logger.error(f"Failed to create cleaning task: {e}", exc_info=True) @@ -149,7 +152,7 @@ async def get_cleaning_task( try: task_service = _get_task_service(db) task = await task_service.get_task(db, task_id) - return StandardResponse(code=200, message="success", data=task) + return StandardResponse(code="0", message="success", data=task) except Exception as e: logger.error(f"Failed to get cleaning task {task_id}: {e}", exc_info=True) raise HTTPException(status_code=404, detail=str(e)) @@ -170,7 +173,7 @@ async def delete_cleaning_task( task_service = _get_task_service(db) await 
task_service.delete_task(db, task_id) await db.commit() - return StandardResponse(code=200, message="success", data=task_id) + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: await db.rollback() logger.error(f"Failed to delete cleaning task {task_id}: {e}", exc_info=True) @@ -191,7 +194,7 @@ async def stop_cleaning_task( try: task_service = _get_task_service(db) await task_service.stop_task(db, task_id) - return StandardResponse(code=200, message="success", data=task_id) + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: logger.error(f"Failed to stop cleaning task {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) @@ -211,7 +214,8 @@ async def execute_cleaning_task( try: task_service = _get_task_service(db) await task_service.execute_task(db, task_id) - return StandardResponse(code=200, message="success", data=task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: await db.rollback() logger.error(f"Failed to execute cleaning task {task_id}: {e}", exc_info=True) @@ -232,7 +236,7 @@ async def get_cleaning_task_results( try: task_service = _get_task_service(db) results = await task_service.get_task_results(db, task_id) - return StandardResponse(code=200, message="success", data=results) + return StandardResponse(code="0", message="success", data=results) except Exception as e: logger.error(f"Failed to get task results {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) @@ -253,7 +257,7 @@ async def get_cleaning_task_log( try: task_service = _get_task_service(db) logs = await task_service.get_task_log(db, task_id, retry_count) - return StandardResponse(code=200, message="success", data=logs) + return StandardResponse(code="0", message="success", data=logs) except Exception as e: logger.error(f"Failed to get task log {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py index 3da722e0..85abbb25 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -93,7 +93,7 @@ async def get_cleaning_templates( total_pages = math.ceil(total / size) if total > 0 else 0 return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( content=items, @@ -124,7 +124,7 @@ async def create_cleaning_template( template = await template_service.create_template(db, request) await db.commit() - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: await db.rollback() logger.error(f"Failed to create cleaning template: {e}", exc_info=True) @@ -146,7 +146,7 @@ async def get_cleaning_template( template_service = _get_template_service(db) template = await template_service.get_template(db, template_id) - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: logger.error(f"Failed to get cleaning template {template_id}: {e}", exc_info=True) raise HTTPException(status_code=404, detail=str(e)) @@ -169,7 +169,7 @@ async def update_cleaning_template( 
template = await template_service.update_template(db, template_id, request) await db.commit() - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: await db.rollback() logger.error(f"Failed to update cleaning template {template_id}: {e}", exc_info=True) @@ -191,7 +191,7 @@ async def delete_cleaning_template( template_service = _get_template_service(db) await template_service.delete_template(db, template_id) await db.commit() - return StandardResponse(code=200, message="success", data=template_id) + return StandardResponse(code="0", message="success", data=template_id) except Exception as e: await db.rollback() logger.error(f"Failed to delete cleaning template {template_id}: {e}", exc_info=True) diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py index 8350d179..12ab0c6a 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -31,6 +31,9 @@ CleaningTaskNotFoundError, FileSystemError, ) +from app.module.shared.schema.lineage import NodeType, EdgeType +from app.db.models.base_entity import LineageNode, LineageEdge +from app.module.shared.common.lineage import LineageService logger = get_logger(__name__) @@ -50,6 +53,7 @@ def __init__( scheduler: CleaningTaskScheduler, validator: CleanTaskValidator, dataset_service, + lineage_service: LineageService, ): self.task_repo = task_repo self.result_repo = result_repo @@ -58,6 +62,7 @@ def __init__( self.scheduler = scheduler self.validator = validator self.dataset_service = dataset_service + self.lineage_service = lineage_service async def get_tasks( self, @@ -153,6 +158,7 @@ async def create_task( logger.info(f"Successfully created dataset: {dest_dataset_id}") else: logger.info(f"Using existing dataset: {dest_dataset_id}") + dest_dataset_response = await self.dataset_service.get_dataset(dest_dataset_id) src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id) if not src_dataset: @@ -174,6 +180,8 @@ async def create_task( await self.task_repo.insert_task(db, task_dto) + await self._add_cleaning_to_graph(src_dataset, task_dto, dest_dataset_response) + await self.operator_instance_repo.insert_instance(db, task_id, request.instance) all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, is_star=None) @@ -183,6 +191,40 @@ async def create_task( return await self.get_task(db, task_id) + async def _add_cleaning_to_graph( + self, + src_dataset, + task: CleaningTaskDto, + dest_dataset, + ) -> None: + """ + 添加清洗任务到血缘图 + """ + from_node = LineageNode( + id=src_dataset.id, + node_type=NodeType.DATASET.value, + name=src_dataset.name, + description=src_dataset.description or "", + ) + + to_node = LineageNode( + id=dest_dataset.id, + node_type=NodeType.DATASET.value, + name=dest_dataset.name, + description=dest_dataset.description or "", + ) + + edge = LineageEdge( + process_id=task.id, + name=task.name or "", + description=task.description or "", + edge_type=EdgeType.DATA_CLEANING.value, + from_node_id=from_node.id, + to_node_id=to_node.id, + ) + + await self.lineage_service.generate_graph(from_node, edge, to_node) + async def prepare_task( self, dataset_id: str, diff --git a/runtime/datamate-python/app/module/operator/interface/category_routes.py 
b/runtime/datamate-python/app/module/operator/interface/category_routes.py index ed4207e0..f4be2b43 100644 --- a/runtime/datamate-python/app/module/operator/interface/category_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/category_routes.py @@ -40,4 +40,4 @@ async def get_category_tree( ): """获取分类树""" result = await service.get_all_categories(db) - return StandardResponse(code=200, message="success", data=result) + return StandardResponse(code="0", message="success", data=result) diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py index ee3b9c78..b5eb0a97 100644 --- a/runtime/datamate-python/app/module/operator/interface/operator_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -73,7 +73,7 @@ async def list_operators( total_pages = (count + request.size - 1) // request.size # Ceiling division return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( page=request.page, @@ -100,7 +100,7 @@ async def get_operator( try: operator = await service.get_operator_by_id(operator_id, db) operator.file_name = None # Don't return file_name - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except ValueError as e: raise HTTPException(status_code=404, detail=str(e)) @@ -121,7 +121,7 @@ async def update_operator( try: operator = await service.update_operator(operator_id, request, db) await db.commit() - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: logger.error(f"{operator_id} {request}", e) await db.rollback() @@ -143,7 +143,7 @@ async def create_operator( try: operator = await service.create_operator(request, db) await db.commit() - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) @@ -166,7 +166,7 @@ async def upload_operator( if not file_name: raise HTTPException(status_code=422, detail="fileName is required") operator = await service.upload_operator(file_name, db) - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: logger.error(f"{file_name}", e) raise HTTPException(status_code=400, detail=str(e)) @@ -187,7 +187,7 @@ async def pre_upload( req_id = await service.pre_upload(db) await db.commit() return StandardResponse( - code=200, + code="0", message="success", data=req_id, ) @@ -227,7 +227,7 @@ async def chunk_upload( db=db ) await db.commit() - return StandardResponse(code=200, message="success", data=result.dict()) + return StandardResponse(code="0", message="success", data=result.dict()) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) @@ -248,7 +248,7 @@ async def delete_operator( try: await service.delete_operator(operator_id, db) await db.commit() - return StandardResponse(code=200, message="success", data=None) + return StandardResponse(code="0", message="success", data=None) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) From d080d1af43895028199ad3c596988fda6499b670 Mon Sep 17 00:00:00 2001 From: hhhhsc 
<1710496817@qq.com> Date: Tue, 10 Feb 2026 15:14:11 +0800 Subject: [PATCH 07/20] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?= =?UTF-8?q?=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/public/config/error-code.json | 31 ++-- .../OperatorMarket/Home/OperatorMarket.tsx | 12 +- .../app/core/exception/codes.py | 20 +++ .../app/core/exception/middleware.py | 2 +- .../app/module/cleaning/__init__.py | 21 --- .../interface/cleaning_task_routes.py | 91 ++++------- .../interface/cleaning_template_routes.py | 126 +++++++--------- .../repository/cleaning_task_repository.py | 8 +- .../cleaning_template_repository.py | 8 +- .../cleaning/service/clean_task_validator.py | 41 ++++- .../cleaning/service/cleaning_task_service.py | 36 ++--- .../service/cleaning_template_service.py | 18 ++- .../operator/interface/operator_routes.py | 142 +++++++----------- .../operator/service/operator_service.py | 14 +- 14 files changed, 264 insertions(+), 306 deletions(-) diff --git a/frontend/public/config/error-code.json b/frontend/public/config/error-code.json index 17137f5e..8270f4db 100644 --- a/frontend/public/config/error-code.json +++ b/frontend/public/config/error-code.json @@ -1,20 +1,25 @@ { + "0": "成功", + "cleaning.0001": "清洗任务不存在", + "cleaning.0002": "清洗任务名称重复", + "cleaning.0003": "清洗模板不存在", + "cleaning.0004": "清洗模板名称重复", + "cleaning.0005": "算子输入输出类型不匹配", + "cleaning.0006": "执行器类型无效", + "cleaning.0007": "数据集不存在", + "cleaning.0008": "文件系统错误", + "cleaning.0009": "设置解析错误", + "cleaning.0010": "任务ID不能为空", + "operator.0001": "算子不存在", + "operator.0002": "算子正在使用中", + "operator.0003": "无法删除预置算子", + "operator.0004": "不支持的文件类型", + "operator.0005": "解析算子包失败", + "operator.0006": "缺少必要的字段", "400": "请求参数错误", "401": "登录已过期,请重新登录", "403": "没有权限访问该资源", "404": "请求的资源不存在", "500": "服务器内部错误,请稍后重试", - "502": "网关错误", - "op.0001": "不支持的文件类型", - "op.0002": "算子中缺少元数据文件", - "op.0003": "缺少必要的字段", - "op.0004": "settings字段解析失败", - "op.0005": "算子ID已存在", - "op.0006": "算子名称已存在", - "op.0007": "算子已被编排在模板或未完成的任务中", - "op.0008": "预置算子无法删除", - "clean.0001": "清洗任务名称重复", - "clean.0002": "任务列表为空", - "clean.0003": "算子输入输出不匹配", - "clean.0004": "算子执行器不匹配" + "502": "网关错误" } \ No newline at end of file diff --git a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx index 352ec0e2..ecbc33be 100644 --- a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx +++ b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx @@ -74,14 +74,10 @@ export default function OperatorMarketPage() { }; const handleDeleteOperator = async (operator: OperatorI) => { - try { - await deleteOperatorByIdUsingDelete(operator.id); - message.success("算子删除成功"); - fetchData(); - await initCategoriesTree(); - } catch (error) { - message.error("算子删除失败"); - } + await deleteOperatorByIdUsingDelete(operator.id); + message.success("算子删除成功"); + fetchData(); + await initCategoriesTree(); }; const handleStar = async (operator: OperatorI) => { diff --git a/runtime/datamate-python/app/core/exception/codes.py b/runtime/datamate-python/app/core/exception/codes.py index d741174b..294e6d56 100644 --- a/runtime/datamate-python/app/core/exception/codes.py +++ b/runtime/datamate-python/app/core/exception/codes.py @@ -86,6 +86,26 @@ def __init__(self): RATIO_ALREADY_EXISTS: Final = ErrorCode("ratio.0003", "Task already exists", 400) RATIO_DELETE_FAILED: Final = ErrorCode("ratio.0004", "Failed to delete task", 500) + # 
========== 清洗模块 ========== + CLEANING_TASK_NOT_FOUND: Final = ErrorCode("cleaning.0001", "Cleaning task not found", 404) + CLEANING_NAME_DUPLICATED: Final = ErrorCode("cleaning.0002", "Cleaning task name is duplicated", 400) + CLEANING_TEMPLATE_NOT_FOUND: Final = ErrorCode("cleaning.0003", "Cleaning template not found", 404) + CLEANING_TEMPLATE_NAME_DUPLICATED: Final = ErrorCode("cleaning.0004", "Cleaning template name is duplicated", 400) + CLEANING_INVALID_OPERATOR_INPUT: Final = ErrorCode("cleaning.0005", "Invalid operator input/output types", 400) + CLEANING_INVALID_EXECUTOR_TYPE: Final = ErrorCode("cleaning.0006", "Invalid executor type", 400) + CLEANING_DATASET_NOT_FOUND: Final = ErrorCode("cleaning.0007", "Dataset not found", 404) + CLEANING_FILE_SYSTEM_ERROR: Final = ErrorCode("cleaning.0008", "File system error", 500) + CLEANING_SETTINGS_PARSE_ERROR: Final = ErrorCode("cleaning.0009", "Settings parse error", 400) + CLEANING_TASK_ID_REQUIRED: Final = ErrorCode("cleaning.0010", "Task ID is required", 400) + + # ========== 算子市场模块 ========== + OPERATOR_NOT_FOUND: Final = ErrorCode("operator.0001", "Operator not found", 404) + OPERATOR_IN_INSTANCE: Final = ErrorCode("operator.0002", "Operator is in use", 400) + OPERATOR_CANNOT_DELETE_PREDEFINED: Final = ErrorCode("operator.0003", "Cannot delete predefined operator", 400) + OPERATOR_UNSUPPORTED_FILE_TYPE: Final = ErrorCode("operator.0004", "Unsupported file type", 400) + OPERATOR_PARSE_FAILED: Final = ErrorCode("operator.0005", "Failed to parse operator package", 400) + OPERATOR_FIELD_NOT_FOUND: Final = ErrorCode("operator.0006", "Required field is missing", 400) + # ========== 系统模块 ========== SYSTEM_MODEL_NOT_FOUND: Final = ErrorCode("system.0006", "Model configuration not found", 404) SYSTEM_MODEL_HEALTH_CHECK_FAILED: Final = ErrorCode("system.0007", "Model health check failed", 500) diff --git a/runtime/datamate-python/app/core/exception/middleware.py b/runtime/datamate-python/app/core/exception/middleware.py index 82b03ca2..561d130d 100644 --- a/runtime/datamate-python/app/core/exception/middleware.py +++ b/runtime/datamate-python/app/core/exception/middleware.py @@ -69,7 +69,7 @@ async def dispatch(self, request: Request, call_next): except Exception as exc: # 捕获所有未处理的异常 logger.error( - f"Unhandled exception occurred at {request.method} {request.url.path}", + f"Unhandled exception occurred at {request.method} {request.url.path}", exc, exc_info=True ) return self._error_response( diff --git a/runtime/datamate-python/app/module/cleaning/__init__.py b/runtime/datamate-python/app/module/cleaning/__init__.py index 0d35bbc7..da6c0f3a 100644 --- a/runtime/datamate-python/app/module/cleaning/__init__.py +++ b/runtime/datamate-python/app/module/cleaning/__init__.py @@ -11,18 +11,6 @@ UpdateCleaningTemplateRequest, ) -from .exceptions import ( - CleaningException, - CleaningNameDuplicationError, - CleaningTaskNotFoundError, - CleaningTemplateNotFoundError, - InvalidOperatorInputError, - ExecutorTypeError, - DatasetNotFoundError, - FileSystemError, - SettingsParseError, -) - from .repository import ( CleaningTaskRepository, CleaningTemplateRepository, @@ -50,15 +38,6 @@ "CleaningTemplateDto", "CreateCleaningTemplateRequest", "UpdateCleaningTemplateRequest", - "CleaningException", - "CleaningNameDuplicationError", - "CleaningTaskNotFoundError", - "CleaningTemplateNotFoundError", - "InvalidOperatorInputError", - "ExecutorTypeError", - "DatasetNotFoundError", - "FileSystemError", - "SettingsParseError", "CleaningTaskRepository", 
"CleaningTemplateRepository", "CleaningResultRepository", diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py index dc233f77..82cc24af 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -1,6 +1,6 @@ from typing import Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger @@ -64,13 +64,15 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: dataset_service = DatasetManagementService(db) lineage_service = LineageService(db) + task_repo = CleaningTaskRepository(None) + return CleaningTaskService( - task_repo=CleaningTaskRepository(None), + task_repo=task_repo, result_repo=CleaningResultRepository(None), operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, scheduler=scheduler, - validator=CleanTaskValidator(), + validator=CleanTaskValidator(task_repo=task_repo, template_repo=None), dataset_service=dataset_service, lineage_service=lineage_service, ) @@ -122,20 +124,15 @@ async def create_cleaning_task( db: AsyncSession = Depends(get_db), ): """Create cleaning task""" - try: - task_service = _get_task_service(db) + task_service = _get_task_service(db) - task = await task_service.create_task(db, request) - await db.commit() + task = await task_service.create_task(db, request) + await db.commit() - await task_service.execute_task(db, task.id) - await db.commit() + await task_service.execute_task(db, task.id) + await db.commit() - return StandardResponse(code="0", message="success", data=task) - except Exception as e: - await db.rollback() - logger.error(f"Failed to create cleaning task: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + return StandardResponse(code="0", message="success", data=task) @router.get( @@ -149,13 +146,9 @@ async def get_cleaning_task( db: AsyncSession = Depends(get_db), ): """Get cleaning task by ID""" - try: - task_service = _get_task_service(db) - task = await task_service.get_task(db, task_id) - return StandardResponse(code="0", message="success", data=task) - except Exception as e: - logger.error(f"Failed to get cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=404, detail=str(e)) + task_service = _get_task_service(db) + task = await task_service.get_task(db, task_id) + return StandardResponse(code="0", message="success", data=task) @router.delete( @@ -169,15 +162,10 @@ async def delete_cleaning_task( db: AsyncSession = Depends(get_db), ): """Delete cleaning task""" - try: - task_service = _get_task_service(db) - await task_service.delete_task(db, task_id) - await db.commit() - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to delete cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.delete_task(db, task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) @router.post( @@ -191,13 +179,9 @@ async def stop_cleaning_task( db: AsyncSession = Depends(get_db), ): """Stop cleaning task""" - try: - task_service = _get_task_service(db) - await 
task_service.stop_task(db, task_id) - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - logger.error(f"Failed to stop cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.stop_task(db, task_id) + return StandardResponse(code="0", message="success", data=task_id) @router.post( @@ -211,15 +195,10 @@ async def execute_cleaning_task( db: AsyncSession = Depends(get_db), ): """Execute cleaning task""" - try: - task_service = _get_task_service(db) - await task_service.execute_task(db, task_id) - await db.commit() - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to execute cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.execute_task(db, task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) @router.get( @@ -233,13 +212,9 @@ async def get_cleaning_task_results( db: AsyncSession = Depends(get_db), ): """Get cleaning task results""" - try: - task_service = _get_task_service(db) - results = await task_service.get_task_results(db, task_id) - return StandardResponse(code="0", message="success", data=results) - except Exception as e: - logger.error(f"Failed to get task results {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + results = await task_service.get_task_results(db, task_id) + return StandardResponse(code="0", message="success", data=results) @router.get( @@ -254,10 +229,6 @@ async def get_cleaning_task_log( db: AsyncSession = Depends(get_db), ): """Get cleaning task log""" - try: - task_service = _get_task_service(db) - logs = await task_service.get_task_log(db, task_id, retry_count) - return StandardResponse(code="0", message="success", data=logs) - except Exception as e: - logger.error(f"Failed to get task log {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + logs = await task_service.get_task_log(db, task_id, retry_count) + return StandardResponse(code="0", message="success", data=logs) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py index 85abbb25..9c641e62 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -1,7 +1,9 @@ import math from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, Query + +from fastapi import APIRouter, Depends, Query from sqlalchemy import select, func +from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger from app.db.session import get_db @@ -9,11 +11,9 @@ CleaningTemplateDto, CreateCleaningTemplateRequest, UpdateCleaningTemplateRequest, - OperatorInstanceDto, ) from app.module.cleaning.service import CleaningTemplateService from app.module.shared.schema import StandardResponse, PaginatedData -from sqlalchemy.ext.asyncio import AsyncSession logger = get_logger(__name__) @@ -48,15 +48,16 @@ def _get_template_service(db: AsyncSession) -> CleaningTemplateService: CleaningTemplateRepository, 
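
With this change `CleanTaskValidator` is constructed with the repositories it needs, so duplicate-name checks move out of the individual routes and into the validator, which raises `BusinessError` with `ErrorCodes.CLEANING_NAME_DUPLICATED` / `CLEANING_TEMPLATE_NAME_DUPLICATED` when `is_name_exist` reports a hit. The sketch below mimics that flow with an in-memory repository; the class and exception names are illustrative stand-ins, not the project's actual `CleanTaskValidator` or `BusinessError`.

```python
import asyncio


class FakeTemplateRepo:
    """In-memory stand-in for CleaningTemplateRepository.is_name_exist."""

    def __init__(self, names):
        self._names = set(names)

    async def is_name_exist(self, db, name: str) -> bool:
        return name in self._names


class NameDuplicatedError(Exception):
    pass


class ValidatorSketch:
    """Illustrative counterpart of the duplicate-name check in CleanTaskValidator."""

    def __init__(self, template_repo):
        self.template_repo = template_repo

    async def check_template_name_duplication(self, db, name: str) -> None:
        if not name or await self.template_repo.is_name_exist(db, name):
            raise NameDuplicatedError(name)


async def main():
    validator = ValidatorSketch(FakeTemplateRepo({"existing-template"}))
    await validator.check_template_name_duplication(None, "new-template")  # passes
    try:
        await validator.check_template_name_duplication(None, "existing-template")
    except NameDuplicatedError as exc:
        print("duplicate rejected:", exc)


asyncio.run(main())
```
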
OperatorInstanceRepository, ) - from app.db.models.cleaning import CleaningTemplate, OperatorInstance operator_service = _get_operator_service() + template_repo = CleaningTemplateRepository(None) + return CleaningTemplateService( - template_repo=CleaningTemplateRepository(None), + template_repo=template_repo, operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, - validator=CleanTaskValidator(), + validator=CleanTaskValidator(task_repo=None, template_repo=template_repo), ) @@ -73,39 +74,36 @@ async def get_cleaning_templates( db: AsyncSession = Depends(get_db), ): """Query cleaning templates with pagination""" - try: - from app.db.models.cleaning import CleaningTemplate - - template_service = _get_template_service(db) - - query = select(CleaningTemplate) - - if keyword: - keyword_pattern = f"%{keyword}%" - query = query.where( - CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern) - ) - - count_query = select(func.count()).select_from(query.subquery()) - total = (await db.execute(count_query)).scalar_one() - items = await template_service.get_templates(db, keyword) - - total_pages = math.ceil(total / size) if total > 0 else 0 - - return StandardResponse( - code="0", - message="success", - data=PaginatedData( - content=items, - total_elements=total, - total_pages=total_pages, - page=page, - size=size, - ) + from app.db.models.cleaning import CleaningTemplate + + template_service = _get_template_service(db) + + query = select(CleaningTemplate) + + if keyword: + keyword_pattern = f"%{keyword}%" + query = query.where( + CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern) + ) + + count_query = select(func.count()).select_from(query.subquery()) + total = (await db.execute(count_query)).scalar_one() + + items = await template_service.get_templates(db, keyword) + + total_pages = math.ceil(total / size) if total > 0 else 0 + + return StandardResponse( + code="0", + message="success", + data=PaginatedData( + content=items, + total_elements=total, + total_pages=total_pages, + page=page, + size=size, ) - except Exception as e: - logger.error(f"Failed to get cleaning templates: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + ) @router.post( @@ -119,16 +117,12 @@ async def create_cleaning_template( db: AsyncSession = Depends(get_db), ): """Create cleaning template""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) + + template = await template_service.create_template(db, request) + await db.commit() - template = await template_service.create_template(db, request) - await db.commit() - return StandardResponse(code="0", message="success", data=template) - except Exception as e: - await db.rollback() - logger.error(f"Failed to create cleaning template: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + return StandardResponse(code="0", message="success", data=template) @router.get( @@ -142,14 +136,10 @@ async def get_cleaning_template( db: AsyncSession = Depends(get_db), ): """Get cleaning template by ID""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) - template = await template_service.get_template(db, template_id) - return StandardResponse(code="0", message="success", data=template) - except Exception as e: - logger.error(f"Failed to get cleaning template {template_id}: {e}", exc_info=True) - raise 
HTTPException(status_code=404, detail=str(e)) + template = await template_service.get_template(db, template_id) + return StandardResponse(code="0", message="success", data=template) @router.put( @@ -164,16 +154,12 @@ async def update_cleaning_template( db: AsyncSession = Depends(get_db), ): """Update cleaning template""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) - template = await template_service.update_template(db, template_id, request) - await db.commit() - return StandardResponse(code="0", message="success", data=template) - except Exception as e: - await db.rollback() - logger.error(f"Failed to update cleaning template {template_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + template = await template_service.update_template(db, template_id, request) + await db.commit() + + return StandardResponse(code="0", message="success", data=template) @router.delete( @@ -187,12 +173,8 @@ async def delete_cleaning_template( db: AsyncSession = Depends(get_db), ): """Delete cleaning template""" - try: - template_service = _get_template_service(db) - await template_service.delete_template(db, template_id) - await db.commit() - return StandardResponse(code="0", message="success", data=template_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to delete cleaning template {template_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + template_service = _get_template_service(db) + await template_service.delete_template(db, template_id) + await db.commit() + + return StandardResponse(code="0", message="success", data=template_id) diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py index 56b1bfd0..7c83d9a2 100644 --- a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py @@ -1,6 +1,6 @@ from typing import List, Optional from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy import select, delete +from sqlalchemy import select, delete, func from app.db.models.cleaning import CleaningTask from app.module.cleaning.schema import CleaningTaskDto @@ -132,3 +132,9 @@ async def delete_task_by_id(self, db: AsyncSession, task_id: str) -> None: query = delete(self.model).where(self.model.id == task_id) await db.execute(query) await db.flush() + + async def is_name_exist(self, db: AsyncSession, name: str) -> bool: + """Check if task name exists""" + query = select(func.count()).select_from(self.model).where(self.model.name == name) + result = await db.execute(query) + return result.scalar_one() > 0 if result else False diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py index b2aab16e..aa35ba71 100644 --- a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py @@ -1,6 +1,6 @@ from typing import List, Optional from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy import select, delete +from sqlalchemy import select, delete, func from app.db.models.cleaning import CleaningTemplate @@ -55,3 +55,9 @@ async def delete_template(self, db: AsyncSession, 
template_id: str) -> None: query = delete(self.model).where(self.model.id == template_id) await db.execute(query) await db.flush() + + async def is_name_exist(self, db: AsyncSession, name: str) -> bool: + """Check if template name exists""" + query = select(func.count()).select_from(self.model).where(self.model.name == name) + result = await db.execute(query) + return result.scalar_one() > 0 if result else False diff --git a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py index 4ea94464..0c8de701 100644 --- a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py +++ b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py @@ -1,11 +1,30 @@ -import re +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.exception import BusinessError, ErrorCodes from app.module.cleaning.schema import OperatorInstanceDto -from app.module.cleaning.exceptions import InvalidOperatorInputError, ExecutorTypeError class CleanTaskValidator: """Validator for cleaning tasks and templates""" + def __init__(self, task_repo=None, template_repo=None): + self.task_repo = task_repo + self.template_repo = template_repo + + async def check_task_name_duplication(self, db: AsyncSession, name: str) -> None: + """Check if task name is duplicated""" + if not name: + raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED) + if await self.task_repo.is_name_exist(db, name): + raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED) + + async def check_template_name_duplication(self, db: AsyncSession, name: str) -> None: + """Check if template name is duplicated""" + if not name: + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED) + if await self.template_repo.is_name_exist(db, name): + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED) + @staticmethod def check_input_and_output(instances: list[OperatorInstanceDto]) -> None: """Validate that operator input/output types are compatible""" @@ -17,16 +36,23 @@ def check_input_and_output(instances: list[OperatorInstanceDto]) -> None: next_op = instances[i + 1] if not current.outputs: - raise InvalidOperatorInputError(f"Operator {current.id} has no outputs defined") + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, + f"Operator {current.id} has no outputs defined" + ) if not next_op.inputs: - raise InvalidOperatorInputError(f"Operator {next_op.id} has no inputs defined") + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, + f"Operator {next_op.id} has no inputs defined" + ) current_outputs = set(current.outputs.split(',')) next_inputs = set(next_op.inputs.split(',')) if not current_outputs.intersection(next_inputs): - raise InvalidOperatorInputError( + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, f"Operator {current.id} outputs {current.outputs} " f"but operator {next_op.id} requires {next_op.inputs}" ) @@ -48,7 +74,8 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: executor_types.add("datamate") if len(executor_types) > 1: - raise ExecutorTypeError( + raise BusinessError( + ErrorCodes.CLEANING_INVALID_EXECUTOR_TYPE, "Cannot mix DataMate and DataJuicer operators in same task" ) @@ -58,4 +85,4 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: def check_task_id(task_id: str) -> None: """Validate task ID""" if not task_id: - raise ValueError("Task ID cannot be empty") + raise 
BusinessError(ErrorCodes.CLEANING_TASK_ID_REQUIRED) diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py index 12ab0c6a..5e25ed24 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -1,16 +1,21 @@ import json -import os -import uuid import re import shutil +import uuid from pathlib import Path from typing import List, Dict, Any, Set -from datetime import datetime -from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger +from app.db.models.base_entity import LineageNode, LineageEdge +from app.core.exception import BusinessError, ErrorCodes +from app.module.cleaning.repository import ( + CleaningTaskRepository, + CleaningResultRepository, + OperatorInstanceRepository, +) from app.module.cleaning.schema import ( CleaningTaskDto, CreateCleaningTaskRequest, @@ -20,20 +25,10 @@ CleaningProcess, CleaningTaskStatus, ) -from app.module.cleaning.repository import ( - CleaningTaskRepository, - CleaningResultRepository, - OperatorInstanceRepository, -) -from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler from app.module.cleaning.service.clean_task_validator import CleanTaskValidator -from app.module.cleaning.exceptions import ( - CleaningTaskNotFoundError, - FileSystemError, -) -from app.module.shared.schema.lineage import NodeType, EdgeType -from app.db.models.base_entity import LineageNode, LineageEdge +from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler from app.module.shared.common.lineage import LineageService +from app.module.shared.schema.lineage import NodeType, EdgeType logger = get_logger(__name__) @@ -99,7 +94,7 @@ async def get_task(self, db: AsyncSession, task_id: str) -> CleaningTaskDto: """Get task by ID""" task = await self.task_repo.find_task_by_id(db, task_id) if not task: - raise CleaningTaskNotFoundError(task_id) + raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id) await self._set_process(db, task) @@ -138,6 +133,7 @@ async def create_task( instances = await self.get_instance_by_template_id(db, request.template_id) request.instance = instances + await self.validator.check_task_name_duplication(db, request.name) self.validator.check_input_and_output(request.instance) executor_type = self.validator.check_and_get_executor_type(request.instance) @@ -162,7 +158,7 @@ async def create_task( src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id) if not src_dataset: - raise Exception(f"Source dataset not found: {request.src_dataset_id}") + raise BusinessError(ErrorCodes.CLEANING_DATASET_NOT_FOUND, request.src_dataset_id) task_dto = CleaningTaskDto( id=task_id, @@ -265,7 +261,7 @@ async def prepare_task( yaml.dump(process_config, f, default_flow_style=False, allow_unicode=True) except Exception as e: logger.error(f"Failed to write process.yaml: {e}") - raise FileSystemError(f"Failed to write process.yaml: {e}") + raise BusinessError(ErrorCodes.CLEANING_FILE_SYSTEM_ERROR, str(e)) def _get_default_values(self, operator) -> Dict[str, Any]: """Get default values from operator settings""" @@ -403,7 +399,7 @@ async def execute_task(self, db: AsyncSession, task_id: str) -> bool: task = await self.task_repo.find_task_by_id(db, task_id) if not task: - 
raise CleaningTaskNotFoundError(task_id) + raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id) await self.scan_dataset(db, task_id, task.src_dataset_id, succeed_set) await self.result_repo.delete_by_instance_id(db, task_id, "FAILED") diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py index 8087a36e..eea48fb5 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py @@ -4,16 +4,18 @@ from sqlalchemy.ext.asyncio import AsyncSession +from app.core.exception import BusinessError, ErrorCodes from app.core.logging import get_logger +from app.module.cleaning import UpdateCleaningTemplateRequest +from app.module.cleaning.repository import ( + CleaningTemplateRepository, + OperatorInstanceRepository, +) from app.module.cleaning.schema import ( CleaningTemplateDto, CreateCleaningTemplateRequest, OperatorInstanceDto, ) -from app.module.cleaning.repository import ( - CleaningTemplateRepository, - OperatorInstanceRepository, -) from app.module.cleaning.service.clean_task_validator import CleanTaskValidator logger = get_logger(__name__) @@ -97,7 +99,7 @@ async def get_template( """Get template by ID""" template = await self.template_repo.find_template_by_id(db, template_id) if not template: - raise ValueError(f"Template {template_id} not found") + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id) template_dto = CleaningTemplateDto( id=template.id, @@ -144,6 +146,7 @@ async def create_template( """Create new template""" from app.db.models.cleaning import CleaningTemplate + await self.validator.check_template_name_duplication(db, request.name) self.validator.check_input_and_output(request.instance) self.validator.check_and_get_executor_type(request.instance) @@ -164,14 +167,13 @@ async def update_template( self, db: AsyncSession, template_id: str, - request: CreateCleaningTemplateRequest + request: UpdateCleaningTemplateRequest ) -> CleaningTemplateDto: """Update template""" - from app.db.models.cleaning import CleaningTemplate template = await self.template_repo.find_template_by_id(db, template_id) if not template: - raise ValueError(f"Template {template_id} not found") + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id) template.name = request.name template.description = request.description diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py index b5eb0a97..4ae78f3a 100644 --- a/runtime/datamate-python/app/module/operator/interface/operator_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -4,7 +4,7 @@ """ from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form, File, Body +from fastapi import APIRouter, Depends, UploadFile, Form, File, Body from fastapi.responses import FileResponse from app.core.logging import get_logger @@ -30,6 +30,7 @@ router = APIRouter(prefix="/operators", tags=["Operator"]) + def get_operator_service() -> OperatorService: """获取算子服务实例""" return OperatorService( @@ -51,7 +52,7 @@ def get_operator_service() -> OperatorService: async def list_operators( request: OperatorListRequest, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): 
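
From here the operator routes, like the cleaning routes above, drop the per-endpoint try/except blocks and `HTTPException` wrapping: handlers raise `BusinessError` and rely on the exception middleware to produce the standard `{code, message, data}` body. The sketch below shows that pattern with a plain FastAPI exception handler and hypothetical names; the project itself uses its own middleware and `StandardResponse`, so treat this as an assumption-laden illustration rather than the actual wiring.

```python
from fastapi import FastAPI
from fastapi.responses import JSONResponse


class CodedError(Exception):
    """Illustrative coded exception, analogous in spirit to BusinessError."""

    def __init__(self, code: str, message: str, status: int = 400):
        super().__init__(message)
        self.code, self.message, self.status = code, message, status


app = FastAPI()


@app.exception_handler(CodedError)
async def coded_error_handler(request, exc: CodedError):
    # One place turns coded errors into the {code, message, data} envelope.
    return JSONResponse(
        status_code=exc.status,
        content={"code": exc.code, "message": exc.message, "data": None},
    )


@app.get("/api/operators/{operator_id}")
async def get_operator(operator_id: str):
    if operator_id != "known-id":
        # No try/except in the route: the handler above builds the error body.
        raise CodedError("operator.0001", "Operator not found", 404)
    return {"code": "0", "message": "success", "data": {"id": operator_id}}
```
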
"""查询算子列表""" operators = await service.get_operators( @@ -70,7 +71,7 @@ async def list_operators( db=db ) - total_pages = (count + request.size - 1) // request.size # Ceiling division + total_pages = (count + request.size - 1) // request.size return StandardResponse( code="0", @@ -94,15 +95,12 @@ async def list_operators( async def get_operator( operator_id: str, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """获取算子详情""" - try: - operator = await service.get_operator_by_id(operator_id, db) - operator.file_name = None # Don't return file_name - return StandardResponse(code="0", message="success", data=operator) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) + operator = await service.get_operator_by_id(operator_id, db) + operator.file_name = None + return StandardResponse(code="0", message="success", data=operator) @router.put( @@ -115,17 +113,12 @@ async def update_operator( operator_id: str, request: OperatorUpdateDto, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """更新算子""" - try: - operator = await service.update_operator(operator_id, request, db) - await db.commit() - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - logger.error(f"{operator_id} {request}", e) - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + operator = await service.update_operator(operator_id, request, db) + await db.commit() + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -137,16 +130,12 @@ async def update_operator( async def create_operator( request: OperatorDto, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """创建算子""" - try: - operator = await service.create_operator(request, db) - await db.commit() - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + operator = await service.create_operator(request, db) + await db.commit() + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -158,18 +147,15 @@ async def create_operator( async def upload_operator( request: dict = Body(...), service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """上传算子""" - try: - file_name = request.get("fileName") - if not file_name: - raise HTTPException(status_code=422, detail="fileName is required") - operator = await service.upload_operator(file_name, db) - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - logger.error(f"{file_name}", e) - raise HTTPException(status_code=400, detail=str(e)) + file_name = request.get("fileName") + if not file_name: + from fastapi import HTTPException + raise HTTPException(status_code=422, detail="fileName is required") + operator = await service.upload_operator(file_name, db) + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -180,20 +166,16 @@ async def upload_operator( ) async def pre_upload( service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """预上传""" - try: - req_id = await service.pre_upload(db) - await db.commit() - return StandardResponse( - code="0", - message="success", - data=req_id, - ) - except Exception as e: - await db.rollback() - raise 
HTTPException(status_code=400, detail=str(e)) + req_id = await service.pre_upload(db) + await db.commit() + return StandardResponse( + code="0", + message="success", + data=req_id, + ) @router.post( @@ -211,26 +193,22 @@ async def chunk_upload( file: UploadFile = File(...), check_sum_hex: Optional[str] = Form(None, alias="checkSumHex", description="校验和"), service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """分块上传""" - try: - file_content = await file.read() - result = await service.chunk_upload( - req_id=req_id, - file_no=file_no, - file_name=file_name, - total_chunk_num=total_chunk_num, - chunk_no=chunk_no, - check_sum_hex=check_sum_hex, - file_content=file_content, - db=db - ) - await db.commit() - return StandardResponse(code="0", message="success", data=result.dict()) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + file_content = await file.read() + result = await service.chunk_upload( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + check_sum_hex=check_sum_hex, + file_content=file_content, + db=db + ) + await db.commit() + return StandardResponse(code="0", message="success", data=result.dict()) @router.delete( @@ -242,16 +220,12 @@ async def chunk_upload( async def delete_operator( operator_id: str, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """删除算子""" - try: - await service.delete_operator(operator_id, db) - await db.commit() - return StandardResponse(code="0", message="success", data=None) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + await service.delete_operator(operator_id, db) + await db.commit() + return StandardResponse(code="0", message="success", data=None) @router.get( @@ -261,17 +235,15 @@ async def delete_operator( description="下载示例算子文件" ) async def download_example_operator( - service: OperatorService = Depends(get_operator_service) + service: OperatorService = Depends(get_operator_service), ): """下载示例算子""" from app.module.operator.constants import EXAMPLE_OPERATOR_PATH + example_path = EXAMPLE_OPERATOR_PATH - try: - file_path = service.download_example_operator(example_path) - return FileResponse( - path=str(file_path), - filename=file_path.name, - media_type="application/octet-stream" - ) - except FileNotFoundError: - raise HTTPException(status_code=404, detail="Example file not found") + file_path = service.download_example_operator(example_path) + return FileResponse( + path=str(file_path), + filename=file_path.name, + media_type="application/octet-stream" + ) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 43594e94..bccab373 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -13,6 +13,7 @@ from sqlalchemy import select, text, func from app.core.logging import get_logger +from app.core.exception import BusinessError, ErrorCodes from app.module.operator.repository import ( OperatorRepository, CategoryRelationRepository, @@ -31,11 +32,6 @@ YAML_PATH, SERVICE_ID, ) -from app.module.operator.exceptions import ( - SettingsParseError, - OperatorInInstanceError, - CannotDeletePredefinedOperatorError, -) from app.module.shared.file_service import 
FileService from app.module.shared.file_models import ( ChunkUploadRequestDto, @@ -254,7 +250,7 @@ async def get_operator_by_id( row = result.fetchone() if not row: - raise ValueError(f"Operator {operator_id} not found") + raise BusinessError(ErrorCodes.OPERATOR_NOT_FOUND, operator_id) # Parse categories from comma-separated string categories_str = row.categories if hasattr(row, 'categories') and row.categories else "" @@ -437,14 +433,14 @@ async def delete_operator( in_template = await self.operator_repo.operator_in_template(operator_id, db) in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db) if in_template or in_unstop_task: - raise OperatorInInstanceError() + raise BusinessError(ErrorCodes.OPERATOR_IN_INSTANCE) # Check if operator is predefined is_predefined = await self.category_relation_repo.operator_is_predefined( operator_id, db ) if is_predefined: - raise CannotDeletePredefinedOperatorError() + raise BusinessError(ErrorCodes.OPERATOR_CANNOT_DELETE_PREDEFINED) # Get operator for file cleanup operator = await self.get_operator_by_id(operator_id, db) @@ -550,7 +546,7 @@ def _override_settings(self, operator: OperatorDto) -> None: operator.settings = json.dumps(settings) except json.JSONDecodeError as e: - raise SettingsParseError(str(e)) + raise BusinessError(ErrorCodes.OPERATOR_PARSE_FAILED, str(e)) def _convert_to_list_string(self, value: Any) -> str: """转换为逗号分隔的字符串""" From 5b10d432c2aac00eb38a232dd6a95a59ffba3e03 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Sat, 31 Jan 2026 17:09:11 +0800 Subject: [PATCH 08/20] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E6=A0=A1=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../infrastructure/persistence/Impl/OperatorRepositoryImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java index 7b43869b..a36b2649 100644 --- a/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java +++ b/backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java @@ -48,7 +48,7 @@ public int countOperatorByStar(boolean isStar) { @Override public boolean operatorInTemplateOrRunning(String operatorId) { - return mapper.operatorInTemplate(operatorId) > 0 && mapper.operatorInUnstopTask(operatorId) > 0; + return mapper.operatorInTemplate(operatorId) > 0 || mapper.operatorInUnstopTask(operatorId) > 0; } @Override From b597f497e77eb50f03ceb026f934a6b89aea9e3c Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Sat, 31 Jan 2026 18:04:29 +0800 Subject: [PATCH 09/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gateway/ApiGatewayApplication.java | 4 + .../datamate-python/app/db/models/__init__.py | 16 + .../app/db/models/chunk_upload.py | 38 ++ .../datamate-python/app/db/models/operator.py | 70 ++ .../datamate-python/app/module/__init__.py | 4 + .../app/module/operator/README.md | 138 ++++ .../app/module/operator/__init__.py | 4 + 
.../app/module/operator/constants.py | 50 ++ .../app/module/operator/exceptions.py | 72 +++ .../app/module/operator/interface/__init__.py | 9 + .../operator/interface/category_routes.py | 43 ++ .../operator/interface/operator_routes.py | 270 ++++++++ .../app/module/operator/parsers/__init__.py | 15 + .../operator/parsers/abstract_parser.py | 97 +++ .../module/operator/parsers/parser_holder.py | 52 ++ .../app/module/operator/parsers/tar_parser.py | 41 ++ .../app/module/operator/parsers/zip_parser.py | 41 ++ .../module/operator/repository/__init__.py | 15 + .../category_relation_repository.py | 77 +++ .../repository/category_repository.py | 23 + .../repository/operator_release_repository.py | 72 +++ .../repository/operator_repository.py | 121 ++++ .../app/module/operator/schema/__init__.py | 29 + .../app/module/operator/schema/category.py | 40 ++ .../app/module/operator/schema/operator.py | 72 +++ .../app/module/operator/schema/release.py | 22 + .../app/module/operator/service/__init__.py | 11 + .../operator/service/category_service.py | 101 +++ .../operator/service/operator_service.py | 599 ++++++++++++++++++ .../app/module/shared/__init__.py | 21 + .../module/shared/chunk_upload_repository.py | 95 +++ .../app/module/shared/chunks_saver.py | 146 +++++ .../app/module/shared/file_models.py | 38 ++ .../app/module/shared/file_service.py | 187 ++++++ scripts/db/data-operator-init.sql | 13 +- scripts/images/backend-python/Dockerfile | 1 + 36 files changed, 2646 insertions(+), 1 deletion(-) create mode 100644 runtime/datamate-python/app/db/models/chunk_upload.py create mode 100644 runtime/datamate-python/app/db/models/operator.py create mode 100644 runtime/datamate-python/app/module/operator/README.md create mode 100644 runtime/datamate-python/app/module/operator/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/constants.py create mode 100644 runtime/datamate-python/app/module/operator/exceptions.py create mode 100644 runtime/datamate-python/app/module/operator/interface/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/interface/category_routes.py create mode 100644 runtime/datamate-python/app/module/operator/interface/operator_routes.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/abstract_parser.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/parser_holder.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/tar_parser.py create mode 100644 runtime/datamate-python/app/module/operator/parsers/zip_parser.py create mode 100644 runtime/datamate-python/app/module/operator/repository/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/repository/category_relation_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/category_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/operator_release_repository.py create mode 100644 runtime/datamate-python/app/module/operator/repository/operator_repository.py create mode 100644 runtime/datamate-python/app/module/operator/schema/__init__.py create mode 100644 runtime/datamate-python/app/module/operator/schema/category.py create mode 100644 runtime/datamate-python/app/module/operator/schema/operator.py create mode 100644 runtime/datamate-python/app/module/operator/schema/release.py create mode 100644 runtime/datamate-python/app/module/operator/service/__init__.py 
create mode 100644 runtime/datamate-python/app/module/operator/service/category_service.py create mode 100644 runtime/datamate-python/app/module/operator/service/operator_service.py create mode 100644 runtime/datamate-python/app/module/shared/chunk_upload_repository.py create mode 100644 runtime/datamate-python/app/module/shared/chunks_saver.py create mode 100644 runtime/datamate-python/app/module/shared/file_models.py create mode 100644 runtime/datamate-python/app/module/shared/file_service.py diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java index ee504973..de9e1f28 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java @@ -45,6 +45,10 @@ public RouteLocator customRouteLocator(RouteLocatorBuilder builder) { .route("python-service", r -> r.path("/api/rag/**", "api/models/**") .uri("http://datamate-backend-python:18000")) + // 数据评估服务路由 + .route("data-operator", r -> r.path("/api/operators/**", "api/categories/**") + .uri("http://datamate-backend-python:18000")) + .route("deer-flow-frontend", r -> r.path("/chat/**") .uri("http://deer-flow-frontend:3000")) diff --git a/runtime/datamate-python/app/db/models/__init__.py b/runtime/datamate-python/app/db/models/__init__.py index 2b83de26..060e4b64 100644 --- a/runtime/datamate-python/app/db/models/__init__.py +++ b/runtime/datamate-python/app/db/models/__init__.py @@ -21,6 +21,17 @@ EvaluationItem ) +from .operator import ( + Operator, + Category, + CategoryRelation, + OperatorRelease +) + +from .chunk_upload import ( + ChunkUploadPreRequest +) + __all__ = [ "Dataset", "DatasetTag", @@ -32,4 +43,9 @@ "LabelingProject", "EvaluationTask", "EvaluationItem", + "Operator", + "Category", + "CategoryRelation", + "OperatorRelease", + "ChunkUploadPreRequest", ] diff --git a/runtime/datamate-python/app/db/models/chunk_upload.py b/runtime/datamate-python/app/db/models/chunk_upload.py new file mode 100644 index 00000000..5b5a2b0c --- /dev/null +++ b/runtime/datamate-python/app/db/models/chunk_upload.py @@ -0,0 +1,38 @@ +""" +Chunk Upload Database Model +分片上传数据库模型 +""" +from sqlalchemy import Column, String, Integer, DateTime +from sqlalchemy.sql import func + +from app.db.models.base_entity import Base, BaseEntity + + +class ChunkUploadPreRequest(BaseEntity): + """分片上传预请求""" + __tablename__ = "t_chunk_upload_request" + + id = Column(String(36), primary_key=True, comment="请求ID") + total_file_num = Column(Integer, nullable=False, comment="总文件数") + uploaded_file_num = Column(Integer, nullable=True, comment="已上传文件数") + upload_path = Column(String(512), nullable=False, comment="文件路径") + timeout = Column(DateTime, nullable=False, comment="上传请求超时时间") + service_id = Column(String(64), nullable=True, comment="上传请求所属服务ID") + check_info = Column(String(512), nullable=True, comment="业务信息") + + def increment_uploaded_file_num(self): + """增加已上传文件数""" + if self.uploaded_file_num is None: + self.uploaded_file_num = 1 + else: + self.uploaded_file_num += 1 + + def is_upload_complete(self) -> bool: + """检查是否已完成上传""" + return (self.uploaded_file_num is not None and + self.uploaded_file_num == self.total_file_num) + + def is_request_timeout(self) -> bool: + """检查是否已超时""" + from datetime import datetime, timezone + return self.timeout is not None and datetime.now(timezone.utc) > self.timeout diff --git 
a/runtime/datamate-python/app/db/models/operator.py b/runtime/datamate-python/app/db/models/operator.py new file mode 100644 index 00000000..57362461 --- /dev/null +++ b/runtime/datamate-python/app/db/models/operator.py @@ -0,0 +1,70 @@ +""" +Operator Market Data Models +算子市场数据模型 +""" +from sqlalchemy import Column, String, Integer, Boolean, BigInteger, Text, JSON, TIMESTAMP, Index +from sqlalchemy.sql import func + +from app.db.models.base_entity import Base, BaseEntity + + +class Operator(BaseEntity): + """算子实体""" + __tablename__ = "t_operator" + + id = Column(String(36), primary_key=True, index=True, comment="算子ID") + name = Column(String(255), nullable=False, comment="算子名称") + description = Column(Text, nullable=True, comment="算子描述") + version = Column(String(50), nullable=False, comment="算子版本") + inputs = Column(Text, nullable=True, comment="输入定义(JSON)") + outputs = Column(Text, nullable=True, comment="输出定义(JSON)") + runtime = Column(Text, nullable=True, comment="运行时配置(JSON)") + settings = Column(Text, nullable=True, comment="算子设置(JSON)") + file_name = Column(String(255), nullable=True, comment="文件名") + file_size = Column(BigInteger, nullable=True, comment="文件大小(字节)") + metrics = Column(Text, nullable=True, comment="算子指标(JSON)") + usage_count = Column(Integer, default=0, nullable=False, comment="使用次数") + is_star = Column(Boolean, default=False, nullable=False, comment="是否收藏") + + __table_args__ = ( + Index("idx_is_star", "is_star"), + ) + + +class Category(BaseEntity): + """算子分类实体""" + __tablename__ = "t_operator_category" + + id = Column(String(36), primary_key=True, index=True, comment="分类ID") + name = Column(String(255), nullable=False, comment="分类名称") + value = Column(String(255), nullable=True, comment="分类值") + type = Column(String(50), nullable=True, comment="分类类型") + parent_id = Column(String(36), nullable=False, default="0", comment="父分类ID") + + +class CategoryRelation(BaseEntity): + """算子分类关系实体""" + __tablename__ = "t_operator_category_relation" + + category_id = Column(String(36), primary_key=True, comment="分类ID") + operator_id = Column(String(36), primary_key=True, comment="算子ID") + + __table_args__ = ( + Index("idx_category_id", "category_id"), + Index("idx_operator_id", "operator_id"), + ) + + +class OperatorRelease(BaseEntity): + """算子发布版本实体""" + __tablename__ = "t_operator_release" + + id = Column(String(36), primary_key=True, comment="算子ID") + version = Column(String(50), primary_key=True, comment="版本号") + release_date = Column(TIMESTAMP, nullable=False, default=func.now(), comment="发布时间") + changelog = Column(JSON, nullable=True, comment="更新日志列表") + + +# Ignore data scope for operator models +for model in [Operator, Category, CategoryRelation, OperatorRelease]: + model.__ignore_data_scope__ = True diff --git a/runtime/datamate-python/app/module/__init__.py b/runtime/datamate-python/app/module/__init__.py index 7d3c482b..9437b11d 100644 --- a/runtime/datamate-python/app/module/__init__.py +++ b/runtime/datamate-python/app/module/__init__.py @@ -7,6 +7,8 @@ from .evaluation.interface import router as evaluation_router from .collection.interface import router as collection_route from .rag.interface.rag_interface import router as rag_router +from .operator.interface import operator_router +from .operator.interface import category_router router = APIRouter( prefix="/api" @@ -19,5 +21,7 @@ router.include_router(evaluation_router) router.include_router(collection_route) router.include_router(rag_router) +router.include_router(operator_router) 
+router.include_router(category_router) __all__ = ["router"] diff --git a/runtime/datamate-python/app/module/operator/README.md b/runtime/datamate-python/app/module/operator/README.md new file mode 100644 index 00000000..703e8ed3 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/README.md @@ -0,0 +1,138 @@ +# Operator Market Service - Python Implementation + +## 概述 + +这是 `operator-market-service` 的 Python 实现,已集成到 `runtime/datamate-python` 项目中。 + +## 功能 + +- **算子管理**:创建、查询、更新、删除算子 +- **分类管理**:树状分类结构查询 +- **文件上传**:支持算子文件上传和解析(支持 tar/zip 格式) +- **MCP 工具集成**:通过 fastapi-mcp 提供 MCP 工具接口 + +## 目录结构 + +``` +app/module/operator_market/ +├── __init__.py # 模块入口 +├── constants.py # 常量定义 +├── exceptions.py # 异常定义 +├── schema/ # Pydantic Schema 定义 +│ ├── __init__.py +│ ├── operator.py # 算子相关 Schema +│ ├── category.py # 分类相关 Schema +│ └── release.py # 发布版本 Schema +├── parsers/ # 文件解析器 +│ ├── __init__.py +│ ├── abstract_parser.py # 抽象解析器基类 +│ ├── tar_parser.py # TAR 文件解析器 +│ ├── zip_parser.py # ZIP 文件解析器 +│ └── parser_holder.py # 解析器持有者 +├── repository/ # 数据访问层 +│ ├── __init__.py +│ ├── operator_repository.py +│ ├── category_repository.py +│ ├── category_relation_repository.py +│ └── operator_release_repository.py +├── service/ # 服务层 +│ ├── __init__.py +│ ├── operator_service.py +│ └── category_service.py +└── interface/ # API 接口层 + ├── __init__.py + ├── operator_routes.py + └── category_routes.py +``` + +## API 端点 + +### 算子相关 (`/api/operator-market/operators`) + +| 方法 | 路径 | 描述 | +|------|--------|------| +| POST | `/list` | 查询算子列表(支持分页、分类过滤、关键词搜索) | +| GET | `/{operator_id}` | 获取算子详情 | +| PUT | `/{operator_id}` | 更新算子信息 | +| POST | `/create` | 创建新算子 | +| POST | `/upload` | 上传算子文件 | +| POST | `/upload/pre-upload` | 预上传(获取请求 ID) | +| POST | `/upload/chunk` | 分块上传 | +| DELETE | `/{operator_id}` | 删除算子 | +| GET | `/examples/download` | 下载示例算子 | + +### 分类相关 (`/api/operator-market/categories`) + +| 方法 | 路径 | 描述 | +|------|--------|------| +| GET | `/tree` | 获取分类树状结构 | + +## 数据库表 + +- `t_operator` - 算子表 +- `t_operator_category` - 分类表 +- `t_operator_category_relation` - 分类关系表 +- `t_operator_release` - 算子发布版本表 +- `v_operator` - 算子视图(包含分类信息) + +## 文件格式支持 + +算子文件需包含 `metadata.yml` 文件,格式如下: + +```yaml +raw_id: "operator-id" +name: "算子名称" +description: "算子描述" +version: "1.0.0" +language: "python" # python, java +modal: "text" # text, image, audio, video +vendor: "datamate" # datamate, data-juicer, or other +inputs: {...} +outputs: {...} +runtime: {...} +settings: {...} +metrics: {...} +release: + - "更新日志1" + - "更新日志2" +``` + +## 待实现功能 + +- [ ] 算子收藏功能完善 +- [ ] 标签过滤功能 + +## 使用示例 + +### 查询算子列表 + +```bash +curl -X POST "http://localhost:18000/api/operator-market/operators/list" \ + -H "Content-Type: application/json" \ + -d '{ + "page": 1, + "size": 10, + "keyword": "test", + "isStar": false + }' +``` + +### 获取分类树 + +```bash +curl -X GET "http://localhost:18000/api/operator-market/categories/tree" +``` + +### 创建算子 + +```bash +curl -X POST "http://localhost:18000/api/operator-market/operators/create" \ + -H "Content-Type: application/json" \ + -d '{ + "id": "new-operator-id", + "name": "新算子", + "description": "这是一个新算子", + "version": "1.0.0", + "fileName": "operator.tar" + }' +``` diff --git a/runtime/datamate-python/app/module/operator/__init__.py b/runtime/datamate-python/app/module/operator/__init__.py new file mode 100644 index 00000000..1ac84e31 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/__init__.py @@ -0,0 +1,4 @@ +""" +Operator Market Service Module +算子市场服务模块 +""" diff --git 
a/runtime/datamate-python/app/module/operator/constants.py b/runtime/datamate-python/app/module/operator/constants.py new file mode 100644 index 00000000..e6d83ee9 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/constants.py @@ -0,0 +1,50 @@ +""" +Operator Market Constants +算子市场常量定义 +""" + +# Service ID +SERVICE_ID = "operator" + +# YAML metadata path +YAML_PATH = "metadata.yml" + +# Example operator file path +EXAMPLE_OPERATOR_PATH = "/app/test_operator.tar" + +# Category IDs +CATEGORY_PYTHON = "python" +CATEGORY_PYTHON_ID = "9eda9d5d-072b-499b-916c-797a0a8750e1" + +CATEGORY_JAVA = "java" +CATEGORY_JAVA_ID = "b5bfc548-8ef6-417c-b8a6-a4197c078249" + +CATEGORY_CUSTOMIZED_ID = "ec2cdd17-8b93-4a81-88c4-ac9e98d10757" +CATEGORY_TEXT_ID = "d8a5df7a-52a9-42c2-83c4-01062e60f597" +CATEGORY_IMAGE_ID = "de36b61c-9e8a-4422-8c31-d30585c7100f" +CATEGORY_AUDIO_ID = "42dd9392-73e4-458c-81ff-41751ada47b5" +CATEGORY_VIDEO_ID = "a233d584-73c8-4188-ad5d-8f7c8dda9c27" +CATEGORY_ALL_ID = "4d7dbd77-0a92-44f3-9056-2cd62d4a71e4" +CATEGORY_STAR_ID = "51847c24-bba9-11f0-888b-5b143cb738aa" +CATEGORY_PREDEFINED_ID = "96a3b07a-3439-4557-a835-525faad60ca3" +CATEGORY_DATAMATE_ID = "431e7798-5426-4e1a-aae6-b9905a836b34" +CATEGORY_DATA_JUICER_ID = "79b385b4-fde8-4617-bcba-02a176938996" +CATEGORY_OTHER_VENDOR_ID = "f00eaa3e-96c1-4de4-96cd-9848ef5429ec" + +# Category mapping +CATEGORY_MAP = { + CATEGORY_PYTHON: CATEGORY_PYTHON_ID, + CATEGORY_JAVA: CATEGORY_JAVA_ID, + "text": CATEGORY_TEXT_ID, + "image": CATEGORY_IMAGE_ID, + "audio": CATEGORY_AUDIO_ID, + "video": CATEGORY_VIDEO_ID, + "all": CATEGORY_ALL_ID, + "datamate": CATEGORY_DATAMATE_ID, + "data-juicer": CATEGORY_DATA_JUICER_ID, +} + +# File paths +OPERATOR_BASE_PATH = "/operators" +UPLOAD_DIR = "upload" +EXTRACT_DIR = "extract" diff --git a/runtime/datamate-python/app/module/operator/exceptions.py b/runtime/datamate-python/app/module/operator/exceptions.py new file mode 100644 index 00000000..6eca13f5 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/exceptions.py @@ -0,0 +1,72 @@ +""" +Operator Market Exceptions +算子市场异常定义 +""" +from enum import Enum +from typing import Optional + + +class OperatorErrorCode: + """算子错误码""" + def __init__(self, message: str, error_code: str): + self.message = message + self.error_code = error_code + + +class OperatorException(RuntimeError): + """算子异常基类""" + def __init__(self, operator_error_code: OperatorErrorCode): + self.message = operator_error_code.message + self.error_code = operator_error_code.error_code + super().__init__(self.message) + + +class OperatorErrorCodeEnum(Enum): + """算子错误码枚举""" + FIELD_NOT_FOUND = OperatorErrorCode( + "必填字段缺失", "OPERATOR_FIELD_NOT_FOUND" + ) + SETTINGS_PARSE_FAILED = OperatorErrorCode( + "设置解析失败", "OPERATOR_SETTINGS_PARSE_FAILED" + ) + OPERATOR_IN_INSTANCE = OperatorErrorCode( + "算子正在使用中", "OPERATOR_IN_INSTANCE" + ) + CANT_DELETE_PREDEFINED_OPERATOR = OperatorErrorCode( + "无法删除预定义算子", "CANT_DELETE_PREDEFINED_OPERATOR" + ) + + +class FieldNotFoundError(OperatorException): + """必填字段缺失""" + def __init__(self, field_name: str): + super().__init__( + OperatorErrorCodeEnum.FIELD_NOT_FOUND.value + ) + self.message = f"Required field '{field_name}' is missing" + self.field_name = field_name + + +class SettingsParseError(OperatorException): + """设置解析失败""" + def __init__(self, detail: Optional[str] = None): + super().__init__( + OperatorErrorCodeEnum.SETTINGS_PARSE_FAILED.value + ) + self.detail = detail + + +class OperatorInInstanceError(OperatorException): + """算子正在使用中""" + def 
__init__(self): + super().__init__( + OperatorErrorCodeEnum.OPERATOR_IN_INSTANCE.value + ) + + +class CannotDeletePredefinedOperatorError(OperatorException): + """无法删除预定义算子""" + def __init__(self): + super().__init__( + OperatorErrorCodeEnum.CANT_DELETE_PREDEFINED_OPERATOR.value + ) diff --git a/runtime/datamate-python/app/module/operator/interface/__init__.py b/runtime/datamate-python/app/module/operator/interface/__init__.py new file mode 100644 index 00000000..f83ad24f --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/__init__.py @@ -0,0 +1,9 @@ +""" +Operator Market API Interfaces +算子市场 API 接口层 +""" +from .operator_routes import router as operator_router +from .category_routes import router as category_router + + +__all__ = ["operator_router", "category_router"] diff --git a/runtime/datamate-python/app/module/operator/interface/category_routes.py b/runtime/datamate-python/app/module/operator/interface/category_routes.py new file mode 100644 index 00000000..ed4207e0 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/category_routes.py @@ -0,0 +1,43 @@ +""" +Category API Routes +分类 API 路由 +""" +from fastapi import APIRouter, Depends + +from app.db.session import get_db +from app.module.shared.schema import StandardResponse +from app.module.operator.schema import CategoryTreePagedResponse +from app.module.operator.service import CategoryService +from app.module.operator.repository import ( + CategoryRepository, + CategoryRelationRepository, +) +from app.module.operator.repository.operator_repository import OperatorRepository +from app.db.models.operator import Category, CategoryRelation, Operator + +router = APIRouter(prefix="/categories", tags=["Category"]) + + +def get_category_service() -> CategoryService: + """获取分类服务实例""" + return CategoryService( + category_repo=CategoryRepository(Category()), + category_relation_repo=CategoryRelationRepository(CategoryRelation()), + operator_repo=OperatorRepository(Operator()), + ) + + +@router.get( + "/tree", + response_model=StandardResponse[CategoryTreePagedResponse], + summary="获取分类树", + description="获取算子树状分类结构,包含分组维度(如语言、模态)及资源统计数量", + tags=['mcp'] +) +async def get_category_tree( + service: CategoryService = Depends(get_category_service), + db=Depends(get_db) +): + """获取分类树""" + result = await service.get_all_categories(db) + return StandardResponse(code=200, message="success", data=result) diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py new file mode 100644 index 00000000..8a1911d2 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -0,0 +1,270 @@ +""" +Operator API Routes +算子 API 路由 +""" +from pathlib import Path +from typing import List, Optional + +from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form +from fastapi.responses import FileResponse + +from app.db.session import get_db +from app.module.shared.schema import StandardResponse, PaginatedData +from app.module.operator.schema import ( + OperatorDto, + OperatorUpdateDto, + OperatorListRequest, + PreUploadResponse, +) +from app.module.operator.service import OperatorService +from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, +) +from app.module.operator.parsers import ParserHolder +from app.db.models.operator import Operator, CategoryRelation, OperatorRelease +from app.core.logging import 
get_logger +from app.module.shared.file_service import FileService +from app.module.shared.chunk_upload_repository import ChunkUploadRepository +from app.db.models.chunk_upload import ChunkUploadPreRequest + +logger = get_logger(__name__) + +router = APIRouter(prefix="/operators", tags=["Operator"]) + +def get_operator_service() -> OperatorService: + """获取算子服务实例""" + return OperatorService( + operator_repo=OperatorRepository(Operator()), + category_relation_repo=CategoryRelationRepository(CategoryRelation()), + operator_release_repo=OperatorReleaseRepository(OperatorRelease()), + parser_holder=ParserHolder(), + file_service=FileService(ChunkUploadRepository()), + ) + + +@router.post( + "/list", + response_model=StandardResponse[PaginatedData[OperatorDto]], + summary="查询算子列表", + description="根据参数查询算子列表(支持分页、分类过滤、关键词搜索)", + tags=['mcp'] +) +async def list_operators( + request: OperatorListRequest, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """查询算子列表""" + operators = await service.get_operators( + page=request.page, + size=request.size, + categories=request.categories, + keyword=request.keyword, + is_star=request.is_star, + db=db + ) + + count = await service.count_operators( + categories=request.categories, + keyword=request.keyword, + is_star=request.is_star, + db=db + ) + + total_pages = (count + request.size - 1) // request.size # Ceiling division + + return StandardResponse( + code=200, + message="success", + data=PaginatedData( + page=request.page, + size=request.size, + total_elements=count, + total_pages=total_pages, + content=operators, + ) + ) + + +@router.get( + "/{operator_id}", + response_model=StandardResponse[OperatorDto], + summary="获取算子详情", + description="根据 ID 获取算子详细信息" +) +async def get_operator( + operator_id: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """获取算子详情""" + try: + operator = await service.get_operator_by_id(operator_id, db) + return StandardResponse(code=200, message="success", data=operator) + except ValueError as e: + raise HTTPException(status_code=404, detail=str(e)) + + +@router.put( + "/{operator_id}", + response_model=StandardResponse[OperatorDto], + summary="更新算子", + description="更新算子信息" +) +async def update_operator( + operator_id: str, + request: OperatorUpdateDto, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """更新算子""" + try: + operator = await service.update_operator(operator_id, request, db) + await db.commit() + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + logger.error(f"{operator_id} {request}", e) + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/create", + response_model=StandardResponse[OperatorDto], + summary="创建算子", + description="创建新算子" +) +async def create_operator( + request: OperatorDto, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """创建算子""" + try: + operator = await service.create_operator(request, db) + await db.commit() + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/upload", + response_model=StandardResponse[OperatorDto], + summary="上传算子", + description="上传算子文件并解析元数据" +) +async def upload_operator( + file_name: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """上传算子""" + try: + 
operator = await service.upload_operator(file_name, db) + return StandardResponse(code=200, message="success", data=operator) + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/upload/pre-upload", + response_model=StandardResponse[PreUploadResponse], + summary="预上传", + description="获取预上传 ID,用于分块上传" +) +async def pre_upload( + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """预上传""" + result = await service.pre_upload(db) + return StandardResponse( + code=200, + message="success", + data=PreUploadResponse(req_id=result["req_id"]) + ) + + +@router.post( + "/upload/chunk", + response_model=StandardResponse[dict], + summary="分块上传", + description="分块上传算子文件" +) +async def chunk_upload( + req_id: str = Form(..., description="预上传ID"), + file_no: int = Form(1, description="文件编号"), + file_name: str = Form(..., description="文件名"), + total_chunk_num: int = Form(1, description="总分块数"), + chunk_no: int = Form(1, description="当前分块号"), + file: UploadFile = ..., + check_sum_hex: Optional[str] = Form(None, description="校验和"), + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """分块上传""" + try: + file_content = await file.read() + result = await service.chunk_upload( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + check_sum_hex=check_sum_hex, + file_content=file_content, + db=db + ) + await db.commit() + return StandardResponse(code=200, message="success", data=result.dict()) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.delete( + "/{operator_id}", + response_model=StandardResponse[None], + summary="删除算子", + description="删除算子" +) +async def delete_operator( + operator_id: str, + service: OperatorService = Depends(get_operator_service), + db=Depends(get_db) +): + """删除算子""" + try: + await service.delete_operator(operator_id, db) + await db.commit() + return StandardResponse(code=200, message="success", data=None) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/examples/download", + response_class=FileResponse, + summary="下载示例算子", + description="下载示例算子文件" +) +async def download_example_operator( + service: OperatorService = Depends(get_operator_service) +): + """下载示例算子""" + from app.module.operator.constants import EXAMPLE_OPERATOR_PATH + example_path = EXAMPLE_OPERATOR_PATH + try: + file_path = service.download_example_operator(example_path) + return FileResponse( + path=str(file_path), + filename=file_path.name, + media_type="application/octet-stream" + ) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Example file not found") diff --git a/runtime/datamate-python/app/module/operator/parsers/__init__.py b/runtime/datamate-python/app/module/operator/parsers/__init__.py new file mode 100644 index 00000000..db3c0504 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/__init__.py @@ -0,0 +1,15 @@ +""" +Operator File Parsers +算子文件解析器 +""" +from .abstract_parser import AbstractParser +from .tar_parser import TarParser +from .zip_parser import ZipParser +from .parser_holder import ParserHolder + +__all__ = [ + "AbstractParser", + "TarParser", + "ZipParser", + "ParserHolder", +] diff --git a/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py new 
file mode 100644 index 00000000..27e9aa3c --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py @@ -0,0 +1,97 @@ +""" +Abstract Parser +抽象解析器基类 +""" +import json +import yaml +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional + +from app.module.operator.schema import OperatorDto, OperatorReleaseDto +from app.module.operator.constants import CATEGORY_MAP, CATEGORY_OTHER_VENDOR_ID, CATEGORY_CUSTOMIZED_ID +from app.module.operator.exceptions import FieldNotFoundError + + +class AbstractParser(ABC): + """算子文件解析器抽象基类""" + + @abstractmethod + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """ + 从压缩包内读取指定路径的 yaml 文件并解析为 OperatorDto + + Args: + archive_path: 压缩包路径(zip 或 tar) + entry_path: 压缩包内部的文件路径,例如 "config/app.yaml" + + Returns: + 解析后的 OperatorDto + """ + pass + + @abstractmethod + def extract_to(self, archive_path: str, target_dir: str) -> None: + """ + 将压缩包解压到目标目录(保持相对路径) + + Args: + archive_path: 压缩包路径 + target_dir: 目标目录 + """ + pass + + def parse_yaml(self, yaml_content: str) -> OperatorDto: + """解析 YAML 内容为 OperatorDto""" + content: Dict[str, Any] = yaml.safe_load(yaml_content) + + operator = OperatorDto( + id=self._to_string(content.get("raw_id")), + name=self._to_string(content.get("name")), + description=self._to_string(content.get("description")), + version=self._to_string(content.get("version")), + inputs=self._to_json(content.get("inputs")), + outputs=self._to_json(content.get("outputs")), + runtime=self._to_json(content.get("runtime")), + settings=self._to_json(content.get("settings")), + metrics=self._to_json(content.get("metrics")), + ) + + # Handle changelog + changelog = content.get("release") + if isinstance(changelog, list): + operator_release = OperatorReleaseDto(changelog=changelog) + else: + operator_release = OperatorReleaseDto(changelog=[]) + operator.releases = [operator_release] + + # Build categories + categories = [ + CATEGORY_MAP.get(self._to_lower(content.get("language")), ""), + CATEGORY_MAP.get(self._to_lower(content.get("modal")), ""), + CATEGORY_MAP.get(self._to_lower(content.get("vendor")), CATEGORY_OTHER_VENDOR_ID), + CATEGORY_CUSTOMIZED_ID, + ] + operator.categories = categories + + return operator + + def _to_string(self, obj: Any) -> str: + """转换为字符串""" + if obj is None: + raise FieldNotFoundError("field") + return str(obj) + + def _to_lower(self, obj: Any) -> str: + """转换为小写字符串""" + if obj is None: + raise FieldNotFoundError("field") + return str(obj).lower() + + def _to_json(self, obj: Any) -> Optional[str]: + """转换为 JSON 字符串""" + if obj is None: + return None + try: + return json.dumps(obj) + except (TypeError, ValueError) as e: + raise ValueError(f"Failed to serialize to JSON: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/parser_holder.py b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py new file mode 100644 index 00000000..e4a79d63 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py @@ -0,0 +1,52 @@ +""" +Parser Holder +解析器持有者,根据文件类型选择合适的解析器 +""" +import os +from typing import Dict, Type + +from app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.parsers.tar_parser import TarParser +from app.module.operator.parsers.zip_parser import ZipParser +from app.module.operator.schema import OperatorDto + + +class ParserHolder: + """解析器持有者,根据文件类型选择解析器""" + + def __init__(self): + self._parsers: Dict[str, AbstractParser] = { + "tar": 
TarParser(), + "gz": TarParser(), + "tgz": TarParser(), + "zip": ZipParser(), + } + + def get_parser(self, file_path: str) -> AbstractParser: + """根据文件扩展名获取解析器""" + _, ext = os.path.splitext(file_path) + file_type = ext.lstrip('.').lower() + + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + return self._parsers[file_type] + + def parse_yaml_from_archive( + self, + file_type: str, + archive_path: str, + entry_path: str + ) -> OperatorDto: + """从压缩包解析 YAML""" + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + return self._parsers[file_type].parse_yaml_from_archive(archive_path, entry_path) + + def extract_to(self, file_type: str, archive_path: str, target_dir: str) -> None: + """解压文件到目标目录""" + if file_type not in self._parsers: + raise ValueError(f"Unsupported file type: {file_type}") + + self._parsers[file_type].extract_to(archive_path, target_dir) diff --git a/runtime/datamate-python/app/module/operator/parsers/tar_parser.py b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py new file mode 100644 index 00000000..e2618cfa --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py @@ -0,0 +1,41 @@ +""" +Tar File Parser +TAR 文件解析器 +""" +import tarfile +import os +from typing import Optional + +from app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.schema import OperatorDto + + +class TarParser(AbstractParser): + """TAR 压缩包解析器""" + + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """从 TAR 文件中解析 YAML""" + try: + with tarfile.open(archive_path, 'r:*') as tar: + for member in tar.getmembers(): + if member.name == entry_path or member.name.endswith(f"/{entry_path}"): + file = tar.extractfile(member) + if file: + content = file.read().decode('utf-8') + return self.parse_yaml(content) + raise FileNotFoundError(f"File '{entry_path}' not found in archive") + except (tarfile.TarError, EOFError) as e: + raise ValueError(f"Failed to parse TAR file: {e}") + + def extract_to(self, archive_path: str, target_dir: str) -> None: + """解压 TAR 文件到目标目录""" + try: + os.makedirs(target_dir, exist_ok=True) + with tarfile.open(archive_path, 'r:*') as tar: + # Safety check: prevent path traversal + for member in tar.getmembers(): + if os.path.isabs(member.name) or ".." 
in member.name.split("/"): + raise ValueError(f"Unsafe path in archive: {member.name}") + tar.extractall(target_dir) + except (tarfile.TarError, EOFError) as e: + raise ValueError(f"Failed to extract TAR file: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/zip_parser.py b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py new file mode 100644 index 00000000..a1741efe --- /dev/null +++ b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py @@ -0,0 +1,41 @@ +""" +Zip File Parser +ZIP 文件解析器 +""" +import zipfile +import os +from typing import Optional + +from app.module.operator.parsers.abstract_parser import AbstractParser +from app.module.operator.schema import OperatorDto + + +class ZipParser(AbstractParser): + """ZIP 压缩包解析器""" + + def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + """从 ZIP 文件中解析 YAML""" + try: + with zipfile.ZipFile(archive_path, 'r') as zf: + # Check all possible paths + for name in zf.namelist(): + if name == entry_path or name.endswith(f"/{entry_path}"): + with zf.open(name) as file: + content = file.read().decode('utf-8') + return self.parse_yaml(content) + raise FileNotFoundError(f"File '{entry_path}' not found in archive") + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + raise ValueError(f"Failed to parse ZIP file: {e}") + + def extract_to(self, archive_path: str, target_dir: str) -> None: + """解压 ZIP 文件到目标目录""" + try: + os.makedirs(target_dir, exist_ok=True) + with zipfile.ZipFile(archive_path, 'r') as zf: + # Safety check: prevent path traversal + for name in zf.namelist(): + if os.path.isabs(name) or ".." in name.split("/"): + raise ValueError(f"Unsafe path in archive: {name}") + zf.extractall(target_dir) + except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: + raise ValueError(f"Failed to extract ZIP file: {e}") diff --git a/runtime/datamate-python/app/module/operator/repository/__init__.py b/runtime/datamate-python/app/module/operator/repository/__init__.py new file mode 100644 index 00000000..67859d72 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/__init__.py @@ -0,0 +1,15 @@ +""" +Operator Market Repositories +算子市场数据访问层 +""" +from .operator_repository import OperatorRepository +from .category_repository import CategoryRepository +from .category_relation_repository import CategoryRelationRepository +from .operator_release_repository import OperatorReleaseRepository + +__all__ = [ + "OperatorRepository", + "CategoryRepository", + "CategoryRelationRepository", + "OperatorReleaseRepository", +] diff --git a/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py new file mode 100644 index 00000000..1edd5868 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py @@ -0,0 +1,77 @@ +""" +Category Relation Repository +分类关系数据访问层 +""" +from typing import List + +from sqlalchemy import select, delete, and_ +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import CategoryRelation +from app.module.operator.constants import CATEGORY_PREDEFINED_ID + + +class CategoryRelationRepository: + """分类关系数据访问层""" + + def __init__(self, model: CategoryRelation): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[CategoryRelation]: + """查询所有分类关系""" + result = await db.execute(select(self.model)) + return result.scalars().all() + + async 
def batch_insert( + self, + operator_id: str, + category_ids: List[str], + db: AsyncSession + ) -> None: + """批量插入分类关系""" + for category_id in category_ids: + entity = CategoryRelation( + category_id=category_id, + operator_id=operator_id + ) + db.add(entity) + + async def batch_update( + self, + operator_id: str, + category_ids: List[str], + db: AsyncSession + ) -> None: + """批量更新分类关系(先删除后插入)""" + # Delete existing relations + await db.execute( + delete(self.model) + .where(self.model.operator_id == operator_id) + ) + # Insert new relations + for category_id in category_ids: + entity = CategoryRelation( + category_id=category_id, + operator_id=operator_id + ) + db.add(entity) + + async def delete_by_operator_id(self, operator_id: str, db: AsyncSession) -> None: + """根据算子ID删除分类关系""" + await db.execute( + delete(self.model) + .where(self.model.operator_id == operator_id) + ) + + async def operator_is_predefined(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否为预定义算子""" + result = await db.execute( + select(self.model) + .where( + and_( + self.model.operator_id == operator_id, + self.model.category_id == CATEGORY_PREDEFINED_ID + ) + ) + ) + return result.first() is not None diff --git a/runtime/datamate-python/app/module/operator/repository/category_repository.py b/runtime/datamate-python/app/module/operator/repository/category_repository.py new file mode 100644 index 00000000..b5434d34 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/category_repository.py @@ -0,0 +1,23 @@ +""" +Category Repository +分类数据访问层 +""" +from typing import List + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import Category +from app.module.operator.schema import CategoryDto + + +class CategoryRepository: + """分类数据访问层""" + + def __init__(self, model: Category): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[Category]: + """查询所有分类""" + result = await db.execute(select(self.model)) + return result.scalars().all() diff --git a/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py new file mode 100644 index 00000000..bcab7be8 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/operator_release_repository.py @@ -0,0 +1,72 @@ +""" +Operator Release Repository +算子发布版本数据访问层 +""" +from typing import List + +from sqlalchemy import select, delete, and_ +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import OperatorRelease +from app.module.operator.schema import OperatorReleaseDto + + +class OperatorReleaseRepository: + """算子发布版本数据访问层""" + + def __init__(self, model: OperatorRelease): + self.model = model + + async def find_all_by_operator_id( + self, + operator_id: str, + db: AsyncSession + ) -> List[OperatorRelease]: + """查询算子的所有发布版本""" + result = await db.execute( + select(OperatorRelease) + .where(OperatorRelease.id == operator_id) + .order_by(OperatorRelease.release_date.desc()) + ) + return result.scalars().all() + + async def insert( + self, + dto: OperatorReleaseDto, + db: AsyncSession + ) -> None: + """插入发布版本""" + entity = OperatorRelease( + id=dto.id, + version=dto.version, + release_date=dto.release_date, + changelog=dto.changelog + ) + db.add(entity) + + async def update( + self, + dto: OperatorReleaseDto, + db: AsyncSession + ) -> None: + """更新发布版本""" + result = await db.execute( + select(OperatorRelease) + 
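As a usage sketch of the relation repository above: batch_update replaces whatever category relations currently exist for an operator with exactly the supplied set, and nothing is persisted until the caller flushes or commits the session. The helper function, session handle and category IDs here are illustrative assumptions.

from app.db.models.operator import CategoryRelation
from app.module.operator.repository import CategoryRelationRepository

async def reassign_categories(db, operator_id: str) -> None:
    repo = CategoryRelationRepository(CategoryRelation())
    # Existing relations for this operator are deleted, then the new set is added.
    await repo.batch_update(operator_id, ["cat-language-python", "cat-modal-text"], db)
    await db.commit()
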
.where( + and_( + OperatorRelease.id == dto.id, + OperatorRelease.version == dto.version + ) + ) + ) + entity = result.scalar_one_or_none() + if entity: + entity.changelog = dto.changelog + entity.release_date = dto.release_date + + async def delete(self, operator_id: str, db: AsyncSession) -> None: + """删除算子的所有发布版本""" + await db.execute( + delete(OperatorRelease) + .where(OperatorRelease.id == operator_id) + ) diff --git a/runtime/datamate-python/app/module/operator/repository/operator_repository.py b/runtime/datamate-python/app/module/operator/repository/operator_repository.py new file mode 100644 index 00000000..990f7eb3 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/repository/operator_repository.py @@ -0,0 +1,121 @@ +""" +Operator Repository +算子数据访问层 +""" +import json +from typing import List, Optional +from datetime import datetime, timezone + +from sqlalchemy import select, text, update +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.operator import Operator +from app.module.operator.schema import OperatorDto + + +class OperatorRepository: + """算子数据访问层""" + + def __init__(self, model: Operator): + self.model = model + + async def find_all(self, db: AsyncSession) -> List[Operator]: + """查询所有算子""" + result = await db.execute(select(Operator)) + return result.scalars().all() + + async def insert(self, dto: OperatorDto, db: AsyncSession) -> None: + """插入算子""" + entity = Operator( + id=dto.id, + name=dto.name, + description=dto.description, + version=dto.version, + inputs=dto.inputs, + outputs=dto.outputs, + runtime=dto.runtime, + settings=dto.settings, + file_name=dto.file_name, + file_size=dto.file_size, + metrics=dto.metrics, + usage_count=dto.usage_count or 0, + is_star=dto.is_star or False, + ) + db.add(entity) + + async def update(self, dto: OperatorDto, db: AsyncSession) -> None: + """更新算子""" + await db.execute( + update(Operator) + .where(Operator.id == dto.id) + .values( + name=dto.name, + description=dto.description, + version=dto.version, + inputs=dto.inputs, + outputs=dto.outputs, + runtime=dto.runtime, + settings=dto.settings, + file_name=dto.file_name, + file_size=dto.file_size, + metrics=dto.metrics, + is_star=dto.is_star, + updated_at=datetime.utcnow(), + ) + ) + + async def delete(self, operator_id: str, db: AsyncSession) -> None: + """删除算子""" + entity = await db.get(Operator, operator_id) + if entity: + await db.delete(entity) + + async def count_by_star(self, is_star: bool, db: AsyncSession) -> int: + """统计收藏算子数量""" + result = await db.execute( + select(text("COUNT(*)")) + .select_from(Operator) + .where(Operator.is_star == is_star) + ) + return result.scalar() or 0 + + async def operator_in_template(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否在模板中""" + result = await db.execute( + text(""" + SELECT COUNT(*) FROM t_operator_instance oi + JOIN t_clean_template t ON oi.instance_id = t.id + WHERE oi.operator_id = :operator_id + """), + {"operator_id": operator_id} + ) + return (result.scalar() or 0) > 0 + + async def operator_in_unstop_task(self, operator_id: str, db: AsyncSession) -> bool: + """检查算子是否在未完成的任务中""" + result = await db.execute( + text(""" + SELECT COUNT(*) FROM t_operator_instance oi + JOIN t_clean_task t ON oi.instance_id = t.id + WHERE oi.operator_id = :operator_id AND t.status != 'COMPLETED' + """), + {"operator_id": operator_id} + ) + return (result.scalar() or 0) > 0 + + async def increment_usage_count( + self, + operator_ids: List[str], + db: AsyncSession + ) -> None: + """增加算子使用次数""" + if not 
operator_ids: + return + await db.execute( + update(Operator) + .where(Operator.id.in_(operator_ids)) + .values( + usage_count=Operator.usage_count + 1, + updated_at=datetime.now(timezone.utc), + ) + ) diff --git a/runtime/datamate-python/app/module/operator/schema/__init__.py b/runtime/datamate-python/app/module/operator/schema/__init__.py new file mode 100644 index 00000000..a084cbaf --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/__init__.py @@ -0,0 +1,29 @@ +""" +Operator Market Schemas +算子市场 Schema 定义 +""" +from .operator import ( + OperatorDto, + OperatorListRequest, + PreUploadResponse, + OperatorUpdateDto, +) +from .category import ( + CategoryDto, + CategoryTreeResponse, + CategoryTreePagedResponse, + CategoryRelationDto, +) +from .release import OperatorReleaseDto + +__all__ = [ + "OperatorDto", + "OperatorListRequest", + "PreUploadResponse", + "CategoryDto", + "CategoryTreeResponse", + "CategoryTreePagedResponse", + "CategoryRelationDto", + "OperatorReleaseDto", + "OperatorUpdateDto", +] diff --git a/runtime/datamate-python/app/module/operator/schema/category.py b/runtime/datamate-python/app/module/operator/schema/category.py new file mode 100644 index 00000000..afd6e3c5 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/category.py @@ -0,0 +1,40 @@ +""" +Category Schemas +分类 Schema 定义 +""" +from typing import List, Optional +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel + + +class CategoryDto(BaseResponseModel): + """分类 DTO""" + id: str = Field(..., description="分类ID") + name: str = Field(..., description="分类名称") + value: Optional[str] = Field(None, description="分类值") + type: Optional[str] = Field(None, description="分类类型") + parent_id: Optional[str] = Field(None, description="父分类ID") + count: Optional[int] = Field(0, description="算子数量") + created_at: Optional[datetime] = Field(None, description="创建时间") + + +class CategoryTreeResponse(BaseResponseModel): + """分类树响应""" + id: str = Field(..., description="分类ID") + name: str = Field(..., description="分类名称") + count: int = Field(0, description="算子总数") + categories: List[CategoryDto] = Field(default_factory=list, description="子分类列表") + + +class CategoryTreePagedResponse(BaseResponseModel): + """分类树分页响应""" + star_count: int = Field(0, description="收藏的算子数量") + categories: List[CategoryTreeResponse] = Field(default_factory=list, description="分类树列表") + + +class CategoryRelationDto(BaseResponseModel): + """分类关系 DTO""" + category_id: str = Field(..., description="分类ID") + operator_id: str = Field(..., description="算子ID") diff --git a/runtime/datamate-python/app/module/operator/schema/operator.py b/runtime/datamate-python/app/module/operator/schema/operator.py new file mode 100644 index 00000000..c53ed864 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/operator.py @@ -0,0 +1,72 @@ +""" +Operator Schemas +算子 Schema 定义 +""" +from __future__ import annotations + +from typing import List, Optional, Dict, Any +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel +from .release import OperatorReleaseDto + + +class OperatorDto(BaseResponseModel): + """算子 DTO""" + id: str = Field(..., description="算子ID") + name: str = Field(..., description="算子名称") + description: Optional[str] = Field(None, description="算子描述") + version: str = Field(..., description="算子版本") + inputs: Optional[str] = Field(None, description="输入定义(JSON)") + outputs: 
Optional[str] = Field(None, description="输出定义(JSON)") + runtime: Optional[str] = Field(None, description="运行时配置(JSON)") + settings: Optional[str] = Field(None, description="算子设置(JSON)") + file_name: Optional[str] = Field(None, description="文件名") + file_size: Optional[int] = Field(None, description="文件大小(字节)") + metrics: Optional[str] = Field(None, description="算子指标(JSON)") + usage_count: Optional[int] = Field(None, description="使用次数") + is_star: Optional[bool] = Field(None, description="是否收藏") + categories: Optional[List[str]] = Field(None, description="分类ID列表") + overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值") + requirements: Optional[List[str]] = Field(None, description="Python 依赖列表") + readme: Optional[str] = Field(None, description="README 内容") + releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表") + created_at: Optional[datetime] = Field(None, description="创建时间") + updated_at: Optional[datetime] = Field(None, description="更新时间") + + +class OperatorListRequest(BaseResponseModel): + """算子列表查询请求""" + page: int = Field(1, ge=0, description="页码(从0开始)") + size: int = Field(10, ge=1, le=100, description="页大小") + categories: List[List[str]] = Field(default_factory=list, description="分类ID列表(每个父分类下的id放到一个列表中)") + keyword: Optional[str] = Field(None, description="搜索关键词") + label_name: Optional[str] = Field(None, description="标签名称(暂不支持)") + is_star: Optional[bool] = Field(None, description="是否收藏") + + +class PreUploadResponse(BaseResponseModel): + """预上传响应""" + req_id: str = Field(..., description="请求ID") + + +class OperatorUpdateDto(BaseResponseModel): + """算子更新 DTO(所有字段可选)""" + name: Optional[str] = Field(None, description="算子名称") + description: Optional[str] = Field(None, description="算子描述") + version: Optional[str] = Field(None, description="算子版本") + inputs: Optional[str] = Field(None, description="输入定义(JSON)") + outputs: Optional[str] = Field(None, description="输出定义(JSON)") + runtime: Optional[str] = Field(None, description="运行时配置(JSON)") + settings: Optional[str] = Field(None, description="算子设置(JSON)") + file_name: Optional[str] = Field(None, description="文件名") + file_size: Optional[int] = Field(None, description="文件大小(字节)") + metrics: Optional[str] = Field(None, description="算子指标(JSON)") + usage_count: Optional[int] = Field(None, description="使用次数") + is_star: Optional[bool] = Field(None, description="是否收藏") + categories: Optional[List[str]] = Field(None, description="分类ID列表") + overrides: Optional[Dict[str, Any]] = Field(None, description="设置覆盖值") + requirements: Optional[List[str]] = Field(None, description="Python 依赖列表") + readme: Optional[str] = Field(None, description="README 内容") + releases: Optional[List[OperatorReleaseDto]] = Field(None, description="发布版本列表") diff --git a/runtime/datamate-python/app/module/operator/schema/release.py b/runtime/datamate-python/app/module/operator/schema/release.py new file mode 100644 index 00000000..f91297ee --- /dev/null +++ b/runtime/datamate-python/app/module/operator/schema/release.py @@ -0,0 +1,22 @@ +""" +Operator Release Schemas +算子发布版本 Schema 定义 +""" +from __future__ import annotations + +from typing import List, Optional +from datetime import datetime +from pydantic import BaseModel, Field + +from app.module.shared.schema import BaseResponseModel + + +class OperatorReleaseDto(BaseResponseModel): + """算子发布版本 DTO""" + id: str = Field(..., description="算子ID") + version: str = Field(..., description="版本号") + release_date: Optional[datetime] = Field(None, description="发布时间") + changelog: 
Optional[List[str]] = Field(None, description="更新日志列表") + + +__all__ = ["OperatorReleaseDto"] diff --git a/runtime/datamate-python/app/module/operator/service/__init__.py b/runtime/datamate-python/app/module/operator/service/__init__.py new file mode 100644 index 00000000..3e1c1d0c --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/__init__.py @@ -0,0 +1,11 @@ +""" +Operator Market Services +算子市场服务层 +""" +from .operator_service import OperatorService +from .category_service import CategoryService + +__all__ = [ + "OperatorService", + "CategoryService", +] diff --git a/runtime/datamate-python/app/module/operator/service/category_service.py b/runtime/datamate-python/app/module/operator/service/category_service.py new file mode 100644 index 00000000..47a654b6 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/category_service.py @@ -0,0 +1,101 @@ +""" +Category Service +分类服务层 +""" +from typing import List + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.module.operator.repository import ( + CategoryRepository, + CategoryRelationRepository, +) +from app.module.operator.schema import ( + CategoryDto, + CategoryTreeResponse, + CategoryTreePagedResponse, +) +from app.db.models.operator import Operator +from app.module.operator.repository.operator_repository import OperatorRepository + + +class CategoryService: + """分类服务""" + + def __init__( + self, + category_repo: CategoryRepository, + category_relation_repo: CategoryRelationRepository, + operator_repo: OperatorRepository, + ): + self.category_repo = category_repo + self.category_relation_repo = category_relation_repo + self.operator_repo = operator_repo + + async def get_all_categories( + self, + db: AsyncSession + ) -> CategoryTreePagedResponse: + """获取所有分类(树状结构)""" + # Get all categories + all_categories = await self.category_repo.find_all(db) + category_map = {c.id: c for c in all_categories} + + # Get all relations and count operators per category + all_relations = await self.category_relation_repo.find_all(db) + relation_map = {} + for rel in all_relations: + if rel.category_id not in relation_map: + relation_map[rel.category_id] = 0 + relation_map[rel.category_id] += 1 + + # Group by parent_id + grouped_by_parent = {} + for cat in all_categories: + if cat.parent_id != "0": + if cat.parent_id not in grouped_by_parent: + grouped_by_parent[cat.parent_id] = [] + grouped_by_parent[cat.parent_id].append(cat) + + # Build category trees + parent_ids = sorted( + grouped_by_parent.keys(), + key=lambda pid: category_map[pid].created_at or 0 + ) + + category_trees = [] + for parent_id in parent_ids: + group = grouped_by_parent[parent_id] + parent_category = category_map[parent_id] + + # Build DTOs for children + child_dtos = [] + total_count = 0 + for cat in sorted(group, key=lambda c: c.created_at or 0): + cat_dto = CategoryDto( + id=cat.id, + name=cat.name, + value=cat.value, + type=cat.type, + parent_id=cat.parent_id, + count=relation_map.get(cat.id, 0), + created_at=cat.created_at, + ) + child_dtos.append(cat_dto) + total_count += cat_dto.count + + tree = CategoryTreeResponse( + id=parent_id, + name=parent_category.name, + count=total_count, + categories=child_dtos, + ) + category_trees.append(tree) + + # Get star count + star_count = await self.operator_repo.count_by_star(True, db) + + return CategoryTreePagedResponse( + star_count=star_count, + categories=category_trees, + ) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py 
b/runtime/datamate-python/app/module/operator/service/operator_service.py new file mode 100644 index 00000000..17127e58 --- /dev/null +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -0,0 +1,599 @@ +""" +Operator Service +算子服务层 +""" +import json +import os +import uuid +import shutil +from pathlib import Path +from typing import List, Optional, Dict, Any, TYPE_CHECKING + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, text, func + +from app.core.logging import get_logger +from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, +) +from app.module.operator.schema import ( + OperatorDto, + OperatorUpdateDto, + OperatorReleaseDto, +) +from app.module.operator.parsers import ParserHolder +from app.module.operator.constants import ( + OPERATOR_BASE_PATH, + UPLOAD_DIR, + EXTRACT_DIR, + YAML_PATH, + SERVICE_ID, +) +from app.module.operator.exceptions import ( + SettingsParseError, + OperatorInInstanceError, + CannotDeletePredefinedOperatorError, +) +from app.module.shared.file_service import FileService +from app.module.shared.file_models import ( + ChunkUploadRequestDto, + FileUploadResult, +) + +logger = get_logger(__name__) + + +class OperatorService: + """算子服务""" + + def __init__( + self, + operator_repo: OperatorRepository, + category_relation_repo: CategoryRelationRepository, + operator_release_repo: OperatorReleaseRepository, + parser_holder: ParserHolder, + file_service: FileService, + ): + self.operator_repo = operator_repo + self.category_relation_repo = category_relation_repo + self.operator_release_repo = operator_release_repo + self.parser_holder = parser_holder + self.file_service = file_service + + async def get_operators( + self, + page: int, + size: int, + categories: List[List[str]], + keyword: Optional[str], + is_star: Optional[bool], + db: AsyncSession + ) -> List[OperatorDto]: + """查询算子列表(分页)""" + offset = page * size + + # Build query with categories filter + conditions = [] + params = {"limit": size, "offset": offset} + + if is_star is not None: + conditions.append("ov.is_star = :is_star") + params["is_star"] = is_star + + if keyword: + conditions.append( + "(ov.operator_name ILIKE :keyword OR ov.description ILIKE :keyword)" + ) + params["keyword"] = f"%{keyword}%" + + where_clause = "" + if conditions: + where_clause = "WHERE " + " AND ".join(conditions) + + # Handle categories grouping + group_by = "GROUP BY ov.operator_id, ov.operator_name, ov.description, ov.version, " \ + "ov.inputs, ov.outputs, ov.runtime, ov.settings, ov.is_star, " \ + "ov.file_size, ov.usage_count, ov.created_at, ov.updated_at, ov.created_by, ov.updated_by" + + having_clause = "" + if categories: + # Flatten all category IDs for IN clause + all_category_ids = [cat_id for sublist in categories for cat_id in sublist] + if all_category_ids: + where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)" + params["category_ids"] = all_category_ids + + # Build HAVING clause for category groups + having_clauses = [] + for i, cat_group in enumerate(categories): + cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group]) + having_clauses.append( + f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0" + ) + having_clause = "HAVING " + " AND ".join(having_clauses) + + query = f""" + SELECT + ov.operator_id AS id, + ov.operator_name AS name, + ov.description, + ov.version, + ov.inputs, + ov.outputs, + 
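To make the category filter concrete, this small standalone snippet reproduces the HAVING clause construction used above for a hypothetical input of two category groups, which reads as "at least one match in each group"; the IDs are made up for illustration.

categories = [["cat-language-python", "cat-language-java"], ["cat-modal-text"]]
having_clauses = []
for cat_group in categories:
    cat_list = ", ".join(f"'{cat_id}'" for cat_id in cat_group)
    having_clauses.append(
        f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0"
    )
print("HAVING " + " AND ".join(having_clauses))
# HAVING SUM(CASE WHEN category_id IN ('cat-language-python', 'cat-language-java') THEN 1 ELSE 0 END) > 0
#    AND SUM(CASE WHEN category_id IN ('cat-modal-text') THEN 1 ELSE 0 END) > 0
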
ov.runtime, + ov.settings, + ov.is_star, + ov.file_size, + ov.usage_count, + ov.created_at, + ov.updated_at + FROM v_operator ov + {where_clause} + {group_by} + {having_clause} + ORDER BY ov.created_at DESC + LIMIT :limit OFFSET :offset + """ + + result = await db.execute(text(query), params) + rows = result.fetchall() + + # Convert to DTOs + operators = [] + for row in rows: + operators.append(OperatorDto( + id=row.id, + name=row.name, + description=row.description, + version=row.version, + inputs=row.inputs, + outputs=row.outputs, + runtime=row.runtime, + settings=row.settings, + file_name=None, + file_size=row.file_size, + metrics=None, + usage_count=row.usage_count, + is_star=row.is_star, + created_at=row.created_at, + updated_at=row.updated_at, + )) + + return operators + + async def count_operators( + self, + categories: List[List[str]], + keyword: Optional[str], + is_star: Optional[bool], + db: AsyncSession + ) -> int: + """统计算子数量""" + conditions = [] + params = {} + + if is_star is not None: + conditions.append("is_star = :is_star") + params["is_star"] = is_star + + if keyword: + conditions.append( + "(operator_name ILIKE :keyword OR description ILIKE :keyword)" + ) + params["keyword"] = f"%{keyword}%" + + where_clause = "" + if conditions: + where_clause = "WHERE " + " AND ".join(conditions) + + # Handle categories grouping + group_by = "GROUP BY operator_id, operator_name, description, version, inputs, outputs, " \ + "runtime, settings, is_star, file_size, usage_count, created_at, updated_at, " \ + "created_by, updated_by" + + having_clause = "" + if categories: + # Flatten all category IDs for IN clause + all_category_ids = [cat_id for sublist in categories for cat_id in sublist] + if all_category_ids: + where_clause += " AND category_id = ANY(:category_ids)" if where_clause else "WHERE category_id = ANY(:category_ids)" + params["category_ids"] = all_category_ids + + # Build HAVING clause for category groups + having_clauses = [] + for i, cat_group in enumerate(categories): + cat_list = ", ".join([f"'{cat_id}'" for cat_id in cat_group]) + having_clauses.append( + f"SUM(CASE WHEN category_id IN ({cat_list}) THEN 1 ELSE 0 END) > 0" + ) + having_clause = "HAVING " + " AND ".join(having_clauses) + + query = f""" + SELECT COUNT(*) as count + FROM ( + SELECT operator_id + FROM v_operator + {where_clause} + {group_by} + {having_clause} + ) AS t + """ + + result = await db.execute(text(query), params) + return result.scalar() or 0 + + async def get_operator_by_id( + self, + operator_id: str, + db: AsyncSession + ) -> OperatorDto: + """根据 ID 获取算子详情""" + result = await db.execute( + text("SELECT * FROM v_operator WHERE operator_id = :operator_id"), + {"operator_id": operator_id} + ) + row = result.fetchone() + + if not row: + raise ValueError(f"Operator {operator_id} not found") + + # Build DTO + operator = OperatorDto( + id=row.operator_id, + name=row.operator_name, + description=row.description, + version=row.version, + inputs=row.inputs, + outputs=row.outputs, + runtime=row.runtime, + settings=row.settings, + file_name=row.file_name, + file_size=row.file_size, + metrics=row.metrics, + usage_count=row.usage_count, + is_star=row.is_star, + created_at=row.created_at, + updated_at=row.updated_at, + ) + + # Read requirements and readme if file exists + if row.file_name: + extract_path = self._get_extract_path( + self._get_stem(row.file_name) + ) + operator.requirements = self._read_requirements(extract_path) + operator.readme = self._get_readme_content(extract_path) + + operator.file_name = 
None # Don't return file_name + + # Load releases + releases = await self.operator_release_repo.find_all_by_operator_id( + operator_id, db + ) + operator.releases = [ + OperatorReleaseDto( + id=release.id, + version=release.version, + release_date=release.release_date, + changelog=release.changelog + ) + for release in releases + ] + + return operator + + async def create_operator( + self, + req: OperatorDto, + db: AsyncSession + ) -> OperatorDto: + """创建算子""" + from datetime import datetime, timezone + + # Generate ID if not provided + if not req.id: + req.id = str(uuid.uuid4()) + + # Override settings + self._override_settings(req) + + # Insert operator + await self.operator_repo.insert(req, db) + + # Insert category relations + if req.categories: + await self.category_relation_repo.batch_insert( + req.id, req.categories, db + ) + + # Insert release + if req.releases: + release = req.releases[0] + release.id = req.id + release.version = req.version + release.release_date = datetime.now(timezone.utc) + await self.operator_release_repo.insert(release, db) + + # Extract files + if req.file_name: + self.parser_holder.extract_to( + self._get_file_type(req.file_name), + self._get_upload_path(req.file_name), + self._get_extract_path(self._get_stem(req.file_name)) + ) + + await db.flush() + return await self.get_operator_by_id(req.id, db) + + async def update_operator( + self, + operator_id: str, + req: OperatorUpdateDto, + db: AsyncSession + ) -> OperatorDto: + """更新算子""" + from datetime import datetime, timezone + + # Get existing operator + existing = await self.get_operator_by_id(operator_id, db) + + # Merge update request into existing operator + # Only update fields that are provided (not None) + if req.name is not None: + existing.name = req.name + if req.description is not None: + existing.description = req.description + if req.version is not None: + existing.version = req.version + if req.inputs is not None: + existing.inputs = req.inputs + if req.outputs is not None: + existing.outputs = req.outputs + if req.runtime is not None: + existing.runtime = req.runtime + if req.settings is not None: + existing.settings = req.settings + if req.file_name is not None: + existing.file_name = req.file_name + if req.file_size is not None: + existing.file_size = req.file_size + if req.metrics is not None: + existing.metrics = req.metrics + if req.usage_count is not None: + existing.usage_count = req.usage_count + if req.is_star is not None: + existing.is_star = req.is_star + if req.categories is not None: + existing.categories = req.categories + if req.overrides is not None: + existing.overrides = req.overrides + + # Override settings + self._override_settings(existing) + + # Update operator + await self.operator_repo.update(existing, db) + + # Update category relations + if req.categories is not None: + await self.category_relation_repo.batch_update( + operator_id, req.categories, db + ) + + # Update release + logger.info(f"########### {req.releases}") + if req.releases is not None and len(req.releases) > 0: + release = req.releases[0] + if release.version is None: + release.version = existing.version + release.id = operator_id + release.release_date = datetime.now(timezone.utc) + if existing.version == release.version: + await self.operator_release_repo.update(release, db) + else: + await self.operator_release_repo.insert(release, db) + + # Extract files + if req.file_name is not None: + self.parser_holder.extract_to( + self._get_file_type(req.file_name), + self._get_upload_path(req.file_name), + 
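A short sketch of the partial-update semantics implemented above: only fields explicitly set on OperatorUpdateDto are merged into the stored operator, and anything left as None keeps its stored value. The field values are illustrative.

from app.module.operator.schema import OperatorUpdateDto

req = OperatorUpdateDto(is_star=True, description="Removes near-duplicate text samples")
# name, version, settings, categories and releases are all None here, so
# update_operator() preserves their stored values and only changes the two fields above.
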
self._get_extract_path(self._get_stem(req.file_name)) + ) + + await db.flush() + return await self.get_operator_by_id(operator_id, db) + + async def delete_operator( + self, + operator_id: str, + db: AsyncSession + ) -> None: + """删除算子""" + # Check if operator is in use + in_template = await self.operator_repo.operator_in_template(operator_id, db) + in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db) + if in_template and in_unstop_task: + raise OperatorInInstanceError() + + # Check if operator is predefined + is_predefined = await self.category_relation_repo.operator_is_predefined( + operator_id, db + ) + if is_predefined: + raise CannotDeletePredefinedOperatorError() + + # Get operator for file cleanup + operator = await self.get_operator_by_id(operator_id, db) + + # Delete from database + await self.operator_repo.delete(operator_id, db) + await self.category_relation_repo.delete_by_operator_id(operator_id, db) + await self.operator_release_repo.delete(operator_id, db) + + # Delete extracted files + if operator.file_name: + extract_path = self._get_extract_path(self._get_stem(operator.file_name)) + shutil.rmtree(extract_path, ignore_errors=True) + + async def upload_operator( + self, + file_name: str, + db: AsyncSession + ) -> OperatorDto: + """上传算子文件并解析元数据""" + return self.parser_holder.parse_yaml_from_archive( + self._get_file_type(file_name), + self._get_upload_path(file_name), + YAML_PATH + ) + + async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: + """预上传,返回请求 ID""" + from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR + + upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR) + req_id = await self.file_service.pre_upload( + upload_path=upload_path, + service_id=SERVICE_ID, + check_info=None + ) + return {"req_id": req_id} + + async def chunk_upload( + self, + req_id: str, + file_no: int, + file_name: str, + total_chunk_num: int, + chunk_no: int, + check_sum_hex: Optional[str], + file_content: bytes, + db: AsyncSession + ) -> FileUploadResult: + """分块上传文件""" + from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR + + upload_path = os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR) + + chunk_request = ChunkUploadRequestDto( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + check_sum_hex=check_sum_hex, + ) + + return await self.file_service.chunk_upload( + chunk_request, upload_path, file_content, db + ) + + def download_example_operator(self, file_path: str) -> Path: + """下载示例算子文件""" + path = Path(file_path) + if not path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + return path + + def _override_settings(self, operator: OperatorDto) -> None: + """用 overrides 值覆盖 settings 的 defaultVal""" + if not operator.settings or not operator.overrides: + return + + try: + settings = json.loads(operator.settings) + for key, value in operator.overrides.items(): + if key not in settings: + continue + + setting = settings[key] + setting_type = setting.get("type") + + match setting_type: + case "slider" | "switch" | "select" | "input" | "radio": + setting["defaultVal"] = value + case "checkbox": + setting["defaultVal"] = self._convert_to_list_string(value) + case "range": + self._update_properties(setting, value) + + settings[key] = setting + + operator.settings = json.dumps(settings) + except json.JSONDecodeError as e: + raise SettingsParseError(str(e)) + + def _convert_to_list_string(self, value: Any) -> str: + """转换为逗号分隔的字符串""" + 
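A worked example of the settings override logic above, run on hypothetical setting keys: a slider takes the raw override value, while a checkbox value is flattened to a comma-separated string.

import json

settings = {
    "threshold": {"type": "slider", "defaultVal": 0.5},
    "languages": {"type": "checkbox", "defaultVal": "en"},
}
overrides = {"threshold": 0.8, "languages": ["en", "zh"]}
# After _override_settings with these inputs, operator.settings would hold:
expected = {
    "threshold": {"type": "slider", "defaultVal": 0.8},
    "languages": {"type": "checkbox", "defaultVal": "en,zh"},
}
print(json.dumps(expected))
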
if value is None: + return "" + if isinstance(value, list): + return ",".join(str(v) for v in value) + return str(value) + + def _update_properties(self, setting: Dict[str, Any], value: Any) -> None: + """更新 range 类型的 properties""" + if not isinstance(value, list): + return + + properties = setting.get("properties", []) + if not isinstance(properties, list) or len(properties) != len(value): + return + + for i, prop in enumerate(properties): + if isinstance(prop, dict): + prop["defaultVal"] = value[i] + + setting["properties"] = properties + + def _read_requirements(self, extract_path: str) -> List[str]: + """读取 requirements.txt""" + requirements_path = Path(extract_path) / "requirements.txt" + if not requirements_path.exists(): + return [] + + requirements = [] + try: + with open(requirements_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + requirements.append(line) + except Exception as e: + logger.warning(f"Failed to read requirements: {e}") + return requirements + + def _get_readme_content(self, extract_path: str) -> str: + """读取 README 内容""" + dir_path = Path(extract_path) + if not dir_path.exists(): + return "" + + candidates = ["README.md", "readme.md", "Readme.md"] + for filename in candidates: + readme_path = dir_path / filename + if readme_path.exists(): + try: + return readme_path.read_text(encoding='utf-8') + except Exception as e: + logger.warning(f"Failed to read README: {e}") + return "" + + def _get_file_type(self, file_name: str) -> str: + """获取文件类型(扩展名)""" + return file_name.rsplit('.', 1)[-1].lower() if '.' in file_name else "" + + def _get_stem(self, file_name: str) -> str: + """获取文件名不含扩展名""" + return file_name.rsplit('.', 1)[0] if '.' in file_name else file_name + + def _get_upload_path(self, file_name: str) -> str: + """获取上传文件路径""" + return os.path.join(OPERATOR_BASE_PATH, UPLOAD_DIR, file_name) + + def _get_extract_path(self, file_stem: str) -> str: + """获取解压路径""" + return os.path.join(OPERATOR_BASE_PATH, EXTRACT_DIR, file_stem) diff --git a/runtime/datamate-python/app/module/shared/__init__.py b/runtime/datamate-python/app/module/shared/__init__.py index e69de29b..fd0d7a1a 100644 --- a/runtime/datamate-python/app/module/shared/__init__.py +++ b/runtime/datamate-python/app/module/shared/__init__.py @@ -0,0 +1,21 @@ +""" +Shared Module Init +共享模块初始化 +""" +from .file_service import FileService +from .file_models import ( + ChunkUploadPreRequestDto, + ChunkUploadRequestDto, + FileUploadResult, +) +from .chunks_saver import ChunksSaver +from .chunk_upload_repository import ChunkUploadRepository + +__all__ = [ + "FileService", + "ChunkUploadPreRequestDto", + "ChunkUploadRequestDto", + "FileUploadResult", + "ChunksSaver", + "ChunkUploadRepository", +] diff --git a/runtime/datamate-python/app/module/shared/chunk_upload_repository.py b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py new file mode 100644 index 00000000..8a0c717d --- /dev/null +++ b/runtime/datamate-python/app/module/shared/chunk_upload_repository.py @@ -0,0 +1,95 @@ +""" +Chunk Upload Repository +分片上传数据访问层 +""" +from typing import Optional, List + +from sqlalchemy import select, update, delete +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db.models.chunk_upload import ChunkUploadPreRequest +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +class ChunkUploadRepository: + """分片上传数据访问层""" + + async def find_by_id( + self, + req_id: str, + db: AsyncSession + ) -> 
Optional[ChunkUploadPreRequest]: + """根据ID查询""" + result = await db.execute( + select(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id) + ) + return result.scalar_one_or_none() + + async def find_by_service_id( + self, + service_id: str, + db: AsyncSession + ) -> List[ChunkUploadPreRequest]: + """根据服务ID查询""" + result = await db.execute( + select(ChunkUploadPreRequest).where( + ChunkUploadPreRequest.service_id == service_id + ) + ) + return result.scalars().all() + + async def find_all(self, db: AsyncSession) -> List[ChunkUploadPreRequest]: + """查询所有""" + result = await db.execute(select(ChunkUploadPreRequest)) + return result.scalars().all() + + async def insert( + self, + request: ChunkUploadPreRequest, + db: AsyncSession + ) -> None: + """插入""" + db.add(request) + + async def update( + self, + request: ChunkUploadPreRequest, + db: AsyncSession + ) -> int: + """更新""" + from datetime import datetime, timezone + result = await db.execute( + update(ChunkUploadPreRequest) + .where(ChunkUploadPreRequest.id == request.id) + .values( + uploaded_file_num=request.uploaded_file_num, + timeout=request.timeout, + ) + ) + return result.rowcount + + async def delete_by_id( + self, + req_id: str, + db: AsyncSession + ) -> int: + """根据ID删除""" + result = await db.execute( + delete(ChunkUploadPreRequest).where(ChunkUploadPreRequest.id == req_id) + ) + return result.rowcount + + async def delete_by_service_id( + self, + service_id: str, + db: AsyncSession + ) -> int: + """根据服务ID删除""" + result = await db.execute( + delete(ChunkUploadPreRequest).where( + ChunkUploadPreRequest.service_id == service_id + ) + ) + return result.rowcount diff --git a/runtime/datamate-python/app/module/shared/chunks_saver.py b/runtime/datamate-python/app/module/shared/chunks_saver.py new file mode 100644 index 00000000..554b263b --- /dev/null +++ b/runtime/datamate-python/app/module/shared/chunks_saver.py @@ -0,0 +1,146 @@ +""" +Chunks Saver +分片保存器,用于处理文件分片上传 +""" +import os +from pathlib import Path +from typing import Optional +from datetime import datetime, timezone + +from fastapi import UploadFile + +from app.core.logging import get_logger +from app.module.shared.file_models import ChunkUploadRequestDto + +logger = get_logger(__name__) + + +class ChunksSaver: + """分片保存器""" + + TEMP_DIR_NAME_FORMAT = "req_%s_chunks" + + @staticmethod + def save( + file_upload_request: ChunkUploadRequestDto, + pre_upload_req_id: str, + upload_path: str, + file_content: bytes + ) -> Optional[Path]: + """ + 保存分片 + + Args: + file_upload_request: 上传分片的请求 + pre_upload_req_id: 预上传请求ID + upload_path: 上传基础路径 + file_content: 文件内容(字节) + + Returns: + 保存后的文件路径,如果不是最后一个分片则返回None + """ + start_time = datetime.now(timezone.utc) + + temp_dir = Path(upload_path) / ( + ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_upload_req_id + ) + temp_dir.mkdir(parents=True, exist_ok=True) + + temp_file = temp_dir / str(file_upload_request.file_no) + + ChunksSaver._append_to_target_file(temp_file, file_content) + + if file_upload_request.total_chunk_num != file_upload_request.chunk_no: + elapsed = (datetime.now(timezone.utc) - start_time).total_seconds() + logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s") + return None + + final_file = Path(upload_path) / file_upload_request.file_name + + try: + temp_file.rename(final_file) + except OSError as e: + logger.error( + f"failed to mv file: {temp_file.name}, req id: {pre_upload_req_id}, error: {e}" + ) + raise ValueError("failed to move file to target dir") from e + + elapsed = 
(datetime.now(timezone.utc) - start_time).total_seconds() + logger.debug(f"save chunk {file_upload_request.chunk_no} cost {elapsed}s") + + return final_file + + @staticmethod + def save_file( + file_upload_request: ChunkUploadRequestDto, + upload_path: str, + file_content: bytes + ) -> Path: + """ + 保存文件(不分片) + + Args: + file_upload_request: 上传请求 + upload_path: 上传路径 + file_content: 文件内容(字节) + + Returns: + 保存后的文件路径 + """ + target_file = Path(upload_path) / file_upload_request.file_name + + logger.info(f"file path {target_file}, file size {len(file_content)}") + + try: + target_file.parent.mkdir(parents=True, exist_ok=True) + target_file.write_bytes(file_content) + except OSError as e: + logger.error(f"failed to save file: {target_file}, error: {e}") + raise ValueError("failed to save file") from e + + return target_file + + @staticmethod + def delete_folder(folder_path: str) -> None: + """ + 删除指定路径下的所有文件 + + Args: + folder_path: 文件夹路径 + """ + folder = Path(folder_path) + + if not folder.exists(): + logger.info(f"folder {folder_path} does not exist") + return + + try: + for item in folder.glob("*"): + if item.is_file(): + item.unlink() + elif item.is_dir(): + for sub_item in item.glob("*"): + if sub_item.is_file(): + sub_item.unlink() + elif sub_item.is_dir(): + ChunksSaver.delete_folder(str(sub_item)) + item.rmdir() + except OSError as e: + logger.error(f"failed to delete folder: {folder_path}, error: {e}") + raise ValueError("failed to delete folder") from e + + @staticmethod + def _append_to_target_file(target_file: Path, content: bytes) -> None: + """ + 追加内容到目标文件末尾 + + Args: + target_file: 目标文件 + content: 要追加的内容 + """ + try: + with open(target_file, "ab") as f: + f.write(content) + except OSError as e: + logger.error(f"failed to append to file: {target_file}, error: {e}") + raise ValueError("failed to append content to file") from e diff --git a/runtime/datamate-python/app/module/shared/file_models.py b/runtime/datamate-python/app/module/shared/file_models.py new file mode 100644 index 00000000..c4e98775 --- /dev/null +++ b/runtime/datamate-python/app/module/shared/file_models.py @@ -0,0 +1,38 @@ +""" +File Models +文件相关模型定义 +""" +from pathlib import Path +from typing import Optional +from pydantic import BaseModel, Field +from datetime import datetime + + +class ChunkUploadPreRequestDto(BaseModel): + """分片上传预请求DTO""" + id: str = Field(..., description="请求ID") + total_file_num: int = Field(..., description="总文件数", ge=1) + uploaded_file_num: Optional[int] = Field(None, description="已上传文件数", ge=0) + upload_path: str = Field(..., description="文件路径") + timeout: Optional[datetime] = Field(None, description="上传请求超时时间") + service_id: Optional[str] = Field(None, description="上传请求所属服务ID") + check_info: Optional[str] = Field(None, description="业务信息") + + +class ChunkUploadRequestDto(BaseModel): + """分片上传请求DTO""" + req_id: str = Field(..., description="预上传返回的ID") + file_no: int = Field(1, description="文件编号", ge=1) + file_name: str = Field(..., description="文件名称") + total_chunk_num: int = Field(1, description="总分块数量", ge=1) + chunk_no: int = Field(1, description="当前分块编号", ge=1) + file_size: Optional[int] = Field(None, description="文件大小", ge=0) + check_sum_hex: Optional[str] = Field(None, description="文件校验和(十六进制字符串)") + + +class FileUploadResult(BaseModel): + """文件上传结果""" + is_all_files_uploaded: bool = Field(..., description="是否所有文件已上传") + check_info: Optional[str] = Field(None, description="业务上传信息") + saved_file_path: Optional[str] = Field(None, description="保存的文件路径") + file_name: str = Field(..., 
description="文件名称") diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py new file mode 100644 index 00000000..1c859c85 --- /dev/null +++ b/runtime/datamate-python/app/module/shared/file_service.py @@ -0,0 +1,187 @@ +""" +File Service +文件服务,处理文件上传、分片上传等功能 +""" +import os +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +from fastapi import UploadFile + +from app.core.logging import get_logger +from app.module.shared.file_models import ( + ChunkUploadPreRequestDto, + ChunkUploadRequestDto, + FileUploadResult, +) +from app.module.shared.chunks_saver import ChunksSaver +from app.module.shared.chunk_upload_repository import ChunkUploadRepository +from app.db.models.chunk_upload import ChunkUploadPreRequest + +logger = get_logger(__name__) + + +class FileService: + """文件服务""" + + DEFAULT_TIMEOUT_SECONDS = 120 + + def __init__( + self, + chunk_upload_repo: ChunkUploadRepository, + ): + self.chunk_upload_repo = chunk_upload_repo + + async def pre_upload( + self, + upload_path: str, + service_id: str, + check_info: Optional[str] = None + ) -> str: + """ + 预上传 + + Args: + upload_path: 上传路径 + service_id: 服务ID + check_info: 业务信息 + + Returns: + 预上传请求ID + """ + req_id = str(uuid.uuid4()) + timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + + pre_request = ChunkUploadPreRequest( + id=req_id, + total_file_num=1, + uploaded_file_num=0, + upload_path=upload_path, + timeout=timeout, + service_id=service_id, + check_info=check_info, + ) + + await self.chunk_upload_repo.insert(pre_request) + return req_id + + async def chunk_upload( + self, + upload_request: ChunkUploadRequestDto, + upload_path: str, + file_content: bytes, + db_session, + ) -> FileUploadResult: + """ + 分片上传 + + Args: + upload_request: 上传请求 + upload_path: 上传路径 + file_content: 文件内容 + db_session: 数据库会话 + + Returns: + 上传结果 + """ + upload_request.file_size = len(file_content) + + pre_request = await self.chunk_upload_repo.find_by_id( + upload_request.req_id, db_session + ) + + if pre_request is None: + logger.error(f"pre-upload request not found: {upload_request.req_id}") + raise ValueError("Pre-upload request not found") + + if pre_request.is_upload_complete(): + logger.error(f"upload already complete: {upload_request.req_id}") + raise ValueError("Upload already complete") + + if pre_request.is_request_timeout(): + logger.error(f"upload request timeout: {upload_request.req_id}") + raise ValueError("Upload request timeout") + + saved_file_path = None + + if upload_request.total_chunk_num > 1: + saved_file_path = await self._upload_chunk( + upload_request, pre_request, upload_path, file_content + ) + else: + saved_file_path = await self._upload_file( + upload_request, pre_request, upload_path, file_content + ) + + update_count = await self.chunk_upload_repo.update(pre_request, db_session) + + if update_count == 0: + logger.error(f"failed to update pre-request: {upload_request.req_id}") + raise ValueError("Failed to update pre-upload request") + + is_finish = pre_request.uploaded_file_num == pre_request.total_file_num + + if is_finish: + temp_dir = os.path.join( + upload_path, + ChunksSaver.TEMP_DIR_NAME_FORMAT % pre_request.id + ) + try: + ChunksSaver.delete_folder(temp_dir) + except Exception as e: + logger.warning(f"failed to delete temp dir: {temp_dir}, error: {e}") + + await self.chunk_upload_repo.delete_by_id(pre_request.id, 
db_session) + + return FileUploadResult( + is_all_files_uploaded=is_finish, + check_info=pre_request.check_info, + saved_file_path=str(saved_file_path) if saved_file_path else None, + file_name=upload_request.file_name, + ) + + async def _upload_file( + self, + upload_request: ChunkUploadRequestDto, + pre_request: ChunkUploadPreRequest, + upload_path: str, + file_content: bytes + ) -> Path: + """上传单文件""" + saved_file = ChunksSaver.save_file( + upload_request, upload_path, file_content + ) + + from datetime import timezone + pre_request.timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + pre_request.increment_uploaded_file_num() + + return saved_file + + async def _upload_chunk( + self, + upload_request: ChunkUploadRequestDto, + pre_request: ChunkUploadPreRequest, + upload_path: str, + file_content: bytes + ) -> Optional[Path]: + """上传分片""" + saved_file = ChunksSaver.save( + upload_request, pre_request.id, upload_path, file_content + ) + + if saved_file is not None: + pre_request.increment_uploaded_file_num() + return saved_file + + from datetime import timezone + pre_request.timeout = datetime.now(timezone.utc).replace( + microsecond=0 + ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + return None diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index 0587b841..c85380a0 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -49,6 +49,10 @@ CREATE TABLE IF NOT EXISTS t_operator_release version VARCHAR(255), release_date TIMESTAMP, changelog JSON, + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (id, version) ); @@ -60,7 +64,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category value VARCHAR(64) UNIQUE, type VARCHAR(64), parent_id VARCHAR(64), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); COMMENT ON TABLE t_operator_category IS '算子分类表'; @@ -76,6 +83,10 @@ CREATE TABLE IF NOT EXISTS t_operator_category_relation ( category_id VARCHAR(64), operator_id VARCHAR(64), + created_by VARCHAR(255), + updated_by VARCHAR(255), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (category_id, operator_id) ); diff --git a/scripts/images/backend-python/Dockerfile b/scripts/images/backend-python/Dockerfile index 4d276dd0..826a0531 100644 --- a/scripts/images/backend-python/Dockerfile +++ b/scripts/images/backend-python/Dockerfile @@ -55,6 +55,7 @@ ENV NLTK_DATA=/usr/local/nltk_data # Copy the rest of the application COPY runtime/datamate-python /app +COPY runtime/ops/examples/test_operator/test_operator.tar /app/test_operator.tar COPY --from=datax-builder /DataX/target/datax/datax /opt/datax RUN cp /opt/datax/plugin/reader/mysqlreader/libs/mysql* /opt/datax/plugin/reader/starrocksreader/libs/ From fd1e4506e778b6dafbe99b1d8090ff23527027e1 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 4 Feb 2026 10:11:50 +0800 Subject: [PATCH 10/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/module/operator/service/operator_service.py | 1 + .../app/module/shared/file_service.py 
| 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 17127e58..094c49f5 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -458,6 +458,7 @@ async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: req_id = await self.file_service.pre_upload( upload_path=upload_path, service_id=SERVICE_ID, + db_session=db, check_info=None ) return {"req_id": req_id} diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py index 1c859c85..e51db024 100644 --- a/runtime/datamate-python/app/module/shared/file_service.py +++ b/runtime/datamate-python/app/module/shared/file_service.py @@ -4,7 +4,7 @@ """ import os import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Optional @@ -38,6 +38,7 @@ async def pre_upload( self, upload_path: str, service_id: str, + db_session, check_info: Optional[str] = None ) -> str: """ @@ -54,7 +55,7 @@ async def pre_upload( req_id = str(uuid.uuid4()) timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) pre_request = ChunkUploadPreRequest( id=req_id, @@ -66,7 +67,7 @@ async def pre_upload( check_info=check_info, ) - await self.chunk_upload_repo.insert(pre_request) + await self.chunk_upload_repo.insert(pre_request, db_session) return req_id async def chunk_upload( @@ -156,10 +157,9 @@ async def _upload_file( upload_request, upload_path, file_content ) - from datetime import timezone pre_request.timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) pre_request.increment_uploaded_file_num() return saved_file @@ -180,8 +180,7 @@ async def _upload_chunk( pre_request.increment_uploaded_file_num() return saved_file - from datetime import timezone pre_request.timeout = datetime.now(timezone.utc).replace( microsecond=0 - ) + timezone.timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) + ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) return None From ce925a4fca33226a17ac8ce9262c42361c9db7f0 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 4 Feb 2026 15:11:54 +0800 Subject: [PATCH 11/20] =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=B8=82=E5=9C=BApytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deployment/docker/datamate/docker-compose.yml | 2 + .../Detail/components/ChangeLog.tsx | 2 +- .../pages/OperatorMarket/operator.const.tsx | 10 +-- .../app/db/models/chunk_upload.py | 8 +-- .../operator/interface/operator_routes.py | 67 ++++++++++--------- .../operator/parsers/abstract_parser.py | 31 +++++++-- .../module/operator/parsers/parser_holder.py | 13 +++- .../app/module/operator/parsers/tar_parser.py | 10 ++- .../app/module/operator/parsers/zip_parser.py | 11 ++- .../category_relation_repository.py | 16 ++--- .../app/module/operator/schema/operator.py | 2 +- .../operator/service/operator_service.py | 65 ++++++++++++------ .../app/module/shared/file_service.py | 17 ++--- 
scripts/db/data-operator-init.sql | 3 - 14 files changed, 161 insertions(+), 96 deletions(-) diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml index 9f3ec006..eb49a0ce 100644 --- a/deployment/docker/datamate/docker-compose.yml +++ b/deployment/docker/datamate/docker-compose.yml @@ -35,6 +35,8 @@ services: - flow_volume:/flow - log_volume:/var/log/datamate - graph_data_volume:/data/rag_storage + - operator-upload-volume:/operators/upload + - operator-runtime-volume:/operators/extract networks: [ datamate ] depends_on: - datamate-database diff --git a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx index 5484a9e1..55c3891b 100644 --- a/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx +++ b/frontend/src/pages/OperatorMarket/Detail/components/ChangeLog.tsx @@ -22,7 +22,7 @@ export default function ChangeLog({ operator }) { )} - {release.changelog.map((change, changeIndex) => ( + {release.changelog?.map((change, changeIndex) => (
{change} diff --git a/frontend/src/pages/OperatorMarket/operator.const.tsx b/frontend/src/pages/OperatorMarket/operator.const.tsx index 1104a118..75d99cb0 100644 --- a/frontend/src/pages/OperatorMarket/operator.const.tsx +++ b/frontend/src/pages/OperatorMarket/operator.const.tsx @@ -148,10 +148,10 @@ export const mapOperator = (op: OperatorI, t: (key: string) => string) => { label: t("operatorMarket.const.language"), value: "Python", }, - { - label: t("operatorMarket.const.function"), - value: functionLabel, - }, + // { + // label: t("operatorMarket.const.function"), + // value: functionLabel, + // }, ], }; }; @@ -198,4 +198,4 @@ export const formatBytes = (bytes: number | null | undefined, decimals: number = // 4. 格式化数值并拼接单位 // parseFloat 用于去掉末尾多余的 0 (例如 "1.20 MB" -> "1.2 MB") return `${parseFloat((bytes / Math.pow(k, i)).toFixed(dm))} ${sizes[i]}`; -}; \ No newline at end of file +}; diff --git a/runtime/datamate-python/app/db/models/chunk_upload.py b/runtime/datamate-python/app/db/models/chunk_upload.py index 5b5a2b0c..e110af98 100644 --- a/runtime/datamate-python/app/db/models/chunk_upload.py +++ b/runtime/datamate-python/app/db/models/chunk_upload.py @@ -5,10 +5,10 @@ from sqlalchemy import Column, String, Integer, DateTime from sqlalchemy.sql import func -from app.db.models.base_entity import Base, BaseEntity +from app.db.models.base_entity import Base -class ChunkUploadPreRequest(BaseEntity): +class ChunkUploadPreRequest(Base): """分片上传预请求""" __tablename__ = "t_chunk_upload_request" @@ -34,5 +34,5 @@ def is_upload_complete(self) -> bool: def is_request_timeout(self) -> bool: """检查是否已超时""" - from datetime import datetime, timezone - return self.timeout is not None and datetime.now(timezone.utc) > self.timeout + from datetime import datetime + return self.timeout is not None and datetime.utcnow() > self.timeout diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py index 8a1911d2..ee3b9c78 100644 --- a/runtime/datamate-python/app/module/operator/interface/operator_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -2,32 +2,29 @@ Operator API Routes 算子 API 路由 """ -from pathlib import Path -from typing import List, Optional +from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form +from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form, File, Body from fastapi.responses import FileResponse +from app.core.logging import get_logger +from app.db.models.operator import Operator, CategoryRelation, OperatorRelease from app.db.session import get_db -from app.module.shared.schema import StandardResponse, PaginatedData +from app.module.operator.parsers import ParserHolder +from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, +) from app.module.operator.schema import ( OperatorDto, OperatorUpdateDto, OperatorListRequest, - PreUploadResponse, ) from app.module.operator.service import OperatorService -from app.module.operator.repository import ( - OperatorRepository, - CategoryRelationRepository, - OperatorReleaseRepository, -) -from app.module.operator.parsers import ParserHolder -from app.db.models.operator import Operator, CategoryRelation, OperatorRelease -from app.core.logging import get_logger -from app.module.shared.file_service import FileService from app.module.shared.chunk_upload_repository import 
ChunkUploadRepository -from app.db.models.chunk_upload import ChunkUploadPreRequest +from app.module.shared.file_service import FileService +from app.module.shared.schema import StandardResponse, PaginatedData logger = get_logger(__name__) @@ -102,6 +99,7 @@ async def get_operator( """获取算子详情""" try: operator = await service.get_operator_by_id(operator_id, db) + operator.file_name = None # Don't return file_name return StandardResponse(code=200, message="success", data=operator) except ValueError as e: raise HTTPException(status_code=404, detail=str(e)) @@ -158,21 +156,25 @@ async def create_operator( description="上传算子文件并解析元数据" ) async def upload_operator( - file_name: str, + request: dict = Body(...), service: OperatorService = Depends(get_operator_service), db=Depends(get_db) ): """上传算子""" try: + file_name = request.get("fileName") + if not file_name: + raise HTTPException(status_code=422, detail="fileName is required") operator = await service.upload_operator(file_name, db) return StandardResponse(code=200, message="success", data=operator) except Exception as e: + logger.error(f"{file_name}", e) raise HTTPException(status_code=400, detail=str(e)) @router.post( "/upload/pre-upload", - response_model=StandardResponse[PreUploadResponse], + response_model=StandardResponse[str], summary="预上传", description="获取预上传 ID,用于分块上传" ) @@ -181,12 +183,17 @@ async def pre_upload( db=Depends(get_db) ): """预上传""" - result = await service.pre_upload(db) - return StandardResponse( - code=200, - message="success", - data=PreUploadResponse(req_id=result["req_id"]) - ) + try: + req_id = await service.pre_upload(db) + await db.commit() + return StandardResponse( + code=200, + message="success", + data=req_id, + ) + except Exception as e: + await db.rollback() + raise HTTPException(status_code=400, detail=str(e)) @router.post( @@ -196,13 +203,13 @@ async def pre_upload( description="分块上传算子文件" ) async def chunk_upload( - req_id: str = Form(..., description="预上传ID"), - file_no: int = Form(1, description="文件编号"), - file_name: str = Form(..., description="文件名"), - total_chunk_num: int = Form(1, description="总分块数"), - chunk_no: int = Form(1, description="当前分块号"), - file: UploadFile = ..., - check_sum_hex: Optional[str] = Form(None, description="校验和"), + req_id: str = Form(..., alias="reqId", description="预上传ID"), + file_no: int = Form(1, alias="fileNo", description="文件编号"), + file_name: str = Form(..., alias="fileName", description="文件名"), + total_chunk_num: int = Form(1, alias="totalChunkNum", description="总分块数"), + chunk_no: int = Form(1, alias="chunkNo", description="当前分块号"), + file: UploadFile = File(...), + check_sum_hex: Optional[str] = Form(None, alias="checkSumHex", description="校验和"), service: OperatorService = Depends(get_operator_service), db=Depends(get_db) ): diff --git a/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py index 27e9aa3c..50ee98cf 100644 --- a/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py +++ b/runtime/datamate-python/app/module/operator/parsers/abstract_parser.py @@ -16,7 +16,13 @@ class AbstractParser(ABC): """算子文件解析器抽象基类""" @abstractmethod - def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + def parse_yaml_from_archive( + self, + archive_path: str, + entry_path: str, + file_name: Optional[str] = None, + file_size: Optional[int] = None + ) -> OperatorDto: """ 从压缩包内读取指定路径的 yaml 文件并解析为 OperatorDto @@ -40,7 +46,12 @@ def extract_to(self, 
archive_path: str, target_dir: str) -> None: """ pass - def parse_yaml(self, yaml_content: str) -> OperatorDto: + def parse_yaml( + self, + yaml_content: str, + file_name: Optional[str] = None, + file_size: Optional[int] = None + ) -> OperatorDto: """解析 YAML 内容为 OperatorDto""" content: Dict[str, Any] = yaml.safe_load(yaml_content) @@ -54,14 +65,24 @@ def parse_yaml(self, yaml_content: str) -> OperatorDto: runtime=self._to_json(content.get("runtime")), settings=self._to_json(content.get("settings")), metrics=self._to_json(content.get("metrics")), + file_name=file_name, + file_size=file_size, ) # Handle changelog changelog = content.get("release") if isinstance(changelog, list): - operator_release = OperatorReleaseDto(changelog=changelog) + operator_release = OperatorReleaseDto( + id=operator.id, + version=operator.version, + changelog=changelog + ) else: - operator_release = OperatorReleaseDto(changelog=[]) + operator_release = OperatorReleaseDto( + id=operator.id, + version=operator.version, + changelog=[] + ) operator.releases = [operator_release] # Build categories @@ -92,6 +113,6 @@ def _to_json(self, obj: Any) -> Optional[str]: if obj is None: return None try: - return json.dumps(obj) + return json.dumps(obj).strip('"').strip("'") except (TypeError, ValueError) as e: raise ValueError(f"Failed to serialize to JSON: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/parser_holder.py b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py index e4a79d63..83522df4 100644 --- a/runtime/datamate-python/app/module/operator/parsers/parser_holder.py +++ b/runtime/datamate-python/app/module/operator/parsers/parser_holder.py @@ -3,7 +3,7 @@ 解析器持有者,根据文件类型选择合适的解析器 """ import os -from typing import Dict, Type +from typing import Dict, Type, Optional from app.module.operator.parsers.abstract_parser import AbstractParser from app.module.operator.parsers.tar_parser import TarParser @@ -36,13 +36,20 @@ def parse_yaml_from_archive( self, file_type: str, archive_path: str, - entry_path: str + entry_path: str, + file_name: Optional[str] = None, + file_size: Optional[int] = None ) -> OperatorDto: """从压缩包解析 YAML""" if file_type not in self._parsers: raise ValueError(f"Unsupported file type: {file_type}") - return self._parsers[file_type].parse_yaml_from_archive(archive_path, entry_path) + return self._parsers[file_type].parse_yaml_from_archive( + archive_path, + entry_path, + file_name, + file_size + ) def extract_to(self, file_type: str, archive_path: str, target_dir: str) -> None: """解压文件到目标目录""" diff --git a/runtime/datamate-python/app/module/operator/parsers/tar_parser.py b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py index e2618cfa..9ce87f88 100644 --- a/runtime/datamate-python/app/module/operator/parsers/tar_parser.py +++ b/runtime/datamate-python/app/module/operator/parsers/tar_parser.py @@ -13,7 +13,13 @@ class TarParser(AbstractParser): """TAR 压缩包解析器""" - def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + def parse_yaml_from_archive( + self, + archive_path: str, + entry_path: str, + file_name: Optional[str] = None, + file_size: Optional[int] = None + ) -> OperatorDto: """从 TAR 文件中解析 YAML""" try: with tarfile.open(archive_path, 'r:*') as tar: @@ -22,7 +28,7 @@ def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> Operato file = tar.extractfile(member) if file: content = file.read().decode('utf-8') - return self.parse_yaml(content) + return self.parse_yaml(content, file_name, file_size) 
raise FileNotFoundError(f"File '{entry_path}' not found in archive") except (tarfile.TarError, EOFError) as e: raise ValueError(f"Failed to parse TAR file: {e}") diff --git a/runtime/datamate-python/app/module/operator/parsers/zip_parser.py b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py index a1741efe..db4a1b73 100644 --- a/runtime/datamate-python/app/module/operator/parsers/zip_parser.py +++ b/runtime/datamate-python/app/module/operator/parsers/zip_parser.py @@ -13,16 +13,21 @@ class ZipParser(AbstractParser): """ZIP 压缩包解析器""" - def parse_yaml_from_archive(self, archive_path: str, entry_path: str) -> OperatorDto: + def parse_yaml_from_archive( + self, + archive_path: str, + entry_path: str, + file_name: Optional[str] = None, + file_size: Optional[int] = None + ) -> OperatorDto: """从 ZIP 文件中解析 YAML""" try: with zipfile.ZipFile(archive_path, 'r') as zf: - # Check all possible paths for name in zf.namelist(): if name == entry_path or name.endswith(f"/{entry_path}"): with zf.open(name) as file: content = file.read().decode('utf-8') - return self.parse_yaml(content) + return self.parse_yaml(content, file_name, file_size) raise FileNotFoundError(f"File '{entry_path}' not found in archive") except (zipfile.BadZipFile, zipfile.LargeZipFile) as e: raise ValueError(f"Failed to parse ZIP file: {e}") diff --git a/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py index 1edd5868..b7de1e99 100644 --- a/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py +++ b/runtime/datamate-python/app/module/operator/repository/category_relation_repository.py @@ -19,7 +19,7 @@ def __init__(self, model: CategoryRelation): async def find_all(self, db: AsyncSession) -> List[CategoryRelation]: """查询所有分类关系""" - result = await db.execute(select(self.model)) + result = await db.execute(select(CategoryRelation)) return result.scalars().all() async def batch_insert( @@ -45,8 +45,8 @@ async def batch_update( """批量更新分类关系(先删除后插入)""" # Delete existing relations await db.execute( - delete(self.model) - .where(self.model.operator_id == operator_id) + delete(CategoryRelation) + .where(CategoryRelation.operator_id == operator_id) ) # Insert new relations for category_id in category_ids: @@ -59,18 +59,18 @@ async def batch_update( async def delete_by_operator_id(self, operator_id: str, db: AsyncSession) -> None: """根据算子ID删除分类关系""" await db.execute( - delete(self.model) - .where(self.model.operator_id == operator_id) + delete(CategoryRelation) + .where(CategoryRelation.operator_id == operator_id) ) async def operator_is_predefined(self, operator_id: str, db: AsyncSession) -> bool: """检查算子是否为预定义算子""" result = await db.execute( - select(self.model) + select(CategoryRelation) .where( and_( - self.model.operator_id == operator_id, - self.model.category_id == CATEGORY_PREDEFINED_ID + CategoryRelation.operator_id == operator_id, + CategoryRelation.category_id == CATEGORY_PREDEFINED_ID ) ) ) diff --git a/runtime/datamate-python/app/module/operator/schema/operator.py b/runtime/datamate-python/app/module/operator/schema/operator.py index c53ed864..f0868542 100644 --- a/runtime/datamate-python/app/module/operator/schema/operator.py +++ b/runtime/datamate-python/app/module/operator/schema/operator.py @@ -39,7 +39,7 @@ class OperatorDto(BaseResponseModel): class OperatorListRequest(BaseResponseModel): """算子列表查询请求""" page: int = Field(1, ge=0, description="页码(从0开始)") - 
size: int = Field(10, ge=1, le=100, description="页大小") + size: int = Field(10, ge=1, description="页大小") categories: List[List[str]] = Field(default_factory=list, description="分类ID列表(每个父分类下的id放到一个列表中)") keyword: Optional[str] = Field(None, description="搜索关键词") label_name: Optional[str] = Field(None, description="标签名称(暂不支持)") diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 094c49f5..3ba4fa84 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -231,7 +231,18 @@ async def get_operator_by_id( ) -> OperatorDto: """根据 ID 获取算子详情""" result = await db.execute( - text("SELECT * FROM v_operator WHERE operator_id = :operator_id"), + text(""" + SELECT + operator_id, operator_name, description, version, inputs, outputs, runtime, + settings, is_star, file_name, file_size, usage_count, metrics, + created_at, updated_at, created_by, updated_by, + STRING_AGG(category_name, ',' ORDER BY created_at DESC) AS categories + FROM v_operator + WHERE operator_id = :operator_id + GROUP BY operator_id, operator_name, description, version, inputs, outputs, runtime, + settings, is_star, file_name, file_size, usage_count, metrics, + created_at, updated_at, created_by, updated_by + """), {"operator_id": operator_id} ) row = result.fetchone() @@ -239,6 +250,10 @@ async def get_operator_by_id( if not row: raise ValueError(f"Operator {operator_id} not found") + # Parse categories from comma-separated string + categories_str = row.categories if hasattr(row, 'categories') and row.categories else "" + categories = [c.strip() for c in categories_str.split(",")] if categories_str else [] + # Build DTO operator = OperatorDto( id=row.operator_id, @@ -256,6 +271,7 @@ async def get_operator_by_id( is_star=row.is_star, created_at=row.created_at, updated_at=row.updated_at, + categories=categories, ) # Read requirements and readme if file exists @@ -266,8 +282,6 @@ async def get_operator_by_id( operator.requirements = self._read_requirements(extract_path) operator.readme = self._get_readme_content(extract_path) - operator.file_name = None # Don't return file_name - # Load releases releases = await self.operator_release_repo.find_all_by_operator_id( operator_id, db @@ -301,6 +315,7 @@ async def create_operator( # Insert operator await self.operator_repo.insert(req, db) + await db.flush() # Insert category relations if req.categories: @@ -313,7 +328,7 @@ async def create_operator( release = req.releases[0] release.id = req.id release.version = req.version - release.release_date = datetime.now(timezone.utc) + release.release_date = datetime.utcnow() await self.operator_release_repo.insert(release, db) # Extract files @@ -324,8 +339,7 @@ async def create_operator( self._get_extract_path(self._get_stem(req.file_name)) ) - await db.flush() - return await self.get_operator_by_id(req.id, db) + return req async def update_operator( self, @@ -339,6 +353,9 @@ async def update_operator( # Get existing operator existing = await self.get_operator_by_id(operator_id, db) + # Save original version for release comparison + original_version = existing.version + # Merge update request into existing operator # Only update fields that are provided (not None) if req.name is not None: @@ -377,20 +394,18 @@ async def update_operator( await self.operator_repo.update(existing, db) # Update category relations - if req.categories is not 
None: + if req.file_name is not None and req.categories is not None: await self.category_relation_repo.batch_update( operator_id, req.categories, db ) # Update release - logger.info(f"########### {req.releases}") if req.releases is not None and len(req.releases) > 0: release = req.releases[0] - if release.version is None: - release.version = existing.version release.id = operator_id - release.release_date = datetime.now(timezone.utc) - if existing.version == release.version: + release.version = req.version + release.release_date = datetime.utcnow() + if original_version == release.version: await self.operator_release_repo.update(release, db) else: await self.operator_release_repo.insert(release, db) @@ -415,7 +430,7 @@ async def delete_operator( # Check if operator is in use in_template = await self.operator_repo.operator_in_template(operator_id, db) in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db) - if in_template and in_unstop_task: + if in_template or in_unstop_task: raise OperatorInInstanceError() # Check if operator is predefined @@ -444,13 +459,17 @@ async def upload_operator( db: AsyncSession ) -> OperatorDto: """上传算子文件并解析元数据""" + file_path = self._get_upload_path(file_name) + file_size = os.path.getsize(file_path) if os.path.exists(file_path) else None return self.parser_holder.parse_yaml_from_archive( self._get_file_type(file_name), - self._get_upload_path(file_name), - YAML_PATH + file_path, + YAML_PATH, + file_name, + file_size ) - async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: + async def pre_upload(self, db: AsyncSession) -> str: """预上传,返回请求 ID""" from app.module.operator.constants import OPERATOR_BASE_PATH, UPLOAD_DIR @@ -461,7 +480,7 @@ async def pre_upload(self, db: AsyncSession) -> Dict[str, str]: db_session=db, check_info=None ) - return {"req_id": req_id} + return req_id async def chunk_upload( self, @@ -570,17 +589,21 @@ def _read_requirements(self, extract_path: str) -> List[str]: def _get_readme_content(self, extract_path: str) -> str: """读取 README 内容""" dir_path = Path(extract_path) - if not dir_path.exists(): + if not dir_path.exists() or not dir_path.is_dir(): + logger.info(f"Directory does not exist or is not a directory: {extract_path}") return "" candidates = ["README.md", "readme.md", "Readme.md"] for filename in candidates: readme_path = dir_path / filename - if readme_path.exists(): + if readme_path.exists() and readme_path.is_file(): try: - return readme_path.read_text(encoding='utf-8') + content = readme_path.read_text(encoding='utf-8') + logger.info(f"Successfully read README from: {readme_path}") + return content except Exception as e: - logger.warning(f"Failed to read README: {e}") + logger.warning(f"Failed to read README from {readme_path}: {e}") + logger.info(f"No README found in: {extract_path}") return "" def _get_file_type(self, file_name: str) -> str: diff --git a/runtime/datamate-python/app/module/shared/file_service.py b/runtime/datamate-python/app/module/shared/file_service.py index e51db024..1a858587 100644 --- a/runtime/datamate-python/app/module/shared/file_service.py +++ b/runtime/datamate-python/app/module/shared/file_service.py @@ -4,21 +4,18 @@ """ import os import uuid -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from pathlib import Path from typing import Optional -from fastapi import UploadFile - from app.core.logging import get_logger +from app.db.models.chunk_upload import ChunkUploadPreRequest +from 
app.module.shared.chunk_upload_repository import ChunkUploadRepository +from app.module.shared.chunks_saver import ChunksSaver from app.module.shared.file_models import ( - ChunkUploadPreRequestDto, ChunkUploadRequestDto, FileUploadResult, ) -from app.module.shared.chunks_saver import ChunksSaver -from app.module.shared.chunk_upload_repository import ChunkUploadRepository -from app.db.models.chunk_upload import ChunkUploadPreRequest logger = get_logger(__name__) @@ -53,7 +50,7 @@ async def pre_upload( 预上传请求ID """ req_id = str(uuid.uuid4()) - timeout = datetime.now(timezone.utc).replace( + timeout = datetime.utcnow().replace( microsecond=0 ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) @@ -157,7 +154,7 @@ async def _upload_file( upload_request, upload_path, file_content ) - pre_request.timeout = datetime.now(timezone.utc).replace( + pre_request.timeout = datetime.utcnow().replace( microsecond=0 ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) pre_request.increment_uploaded_file_num() @@ -180,7 +177,7 @@ async def _upload_chunk( pre_request.increment_uploaded_file_num() return saved_file - pre_request.timeout = datetime.now(timezone.utc).replace( + pre_request.timeout = datetime.utcnow().replace( microsecond=0 ) + timedelta(seconds=self.DEFAULT_TIMEOUT_SECONDS) return None diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index c85380a0..e6650e4c 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -218,9 +218,6 @@ VALUES ('ObjectDetectionRectangle', '图像目标检测与预标注', '基于 YOLOv8 的图像目标检测算子。对输入图像进行目标检测,输出带矩形框与类别标签的标注图像,并生成结构化标注 JSON(包含类别、置信度与边界框坐标)。支持将检测结果导出为 Label Studio 兼容的 predictions 预标注格式(rectanglelabels),可在标注任务中直接加载并进行人工校正,从而显著降低人工标注成本并提升标注效率。', '1.0.0', 'image', 'image,json', null, null, '', 12288, false, 'system', 'system') ON CONFLICT DO NOTHING; -INSERT INTO t_operator_release(id, version, release_date, changelog) -VALUES ('MineruFormatter', '1.0.0', '2026-03-30', '["aaa","bbb"]'); - INSERT INTO t_operator_category_relation(category_id, operator_id) SELECT c.id, o.id FROM t_operator_category c From ec574acb6f809f4df60bb7fc5a20c94cfbf581bc Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Thu, 5 Feb 2026 11:41:22 +0800 Subject: [PATCH 12/20] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86pytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 4 +- .../gateway/ApiGatewayApplication.java | 3 + deployment/helm/datamate/values.yaml | 3 + .../Create/components/ParamConfig.tsx | 7 +- frontend/vite.config.ts | 59 +-- .../datamate-python/app/db/models/cleaning.py | 59 +++ .../datamate-python/app/module/__init__.py | 2 + .../app/module/cleaning/__init__.py | 71 +++ .../app/module/cleaning/exceptions.py | 57 +++ .../app/module/cleaning/interface/__init__.py | 8 + .../interface/cleaning_task_routes.py | 259 +++++++++++ .../interface/cleaning_template_routes.py | 198 +++++++++ .../module/cleaning/repository/__init__.py | 11 + .../repository/cleaning_result_repository.py | 75 ++++ .../repository/cleaning_task_repository.py | 134 ++++++ .../cleaning_template_repository.py | 57 +++ .../operator_instance_repository.py | 56 +++ .../app/module/cleaning/runtime_client.py | 61 +++ .../app/module/cleaning/schema/__init__.py | 25 ++ .../app/module/cleaning/schema/cleaning.py | 138 ++++++ .../app/module/cleaning/service/__init__.py | 11 + .../cleaning/service/clean_task_validator.py | 61 +++ .../service/cleaning_task_scheduler.py | 41 
++ .../cleaning/service/cleaning_task_service.py | 408 ++++++++++++++++++ .../service/cleaning_template_service.py | 224 ++++++++++ .../app/module/dataset/schema/__init__.py | 2 + .../app/module/dataset/schema/dataset.py | 11 + .../app/module/dataset/service/service.py | 78 ++++ .../operator/service/operator_service.py | 10 +- .../ops/examples/test_operator/metadata.yml | 4 +- .../examples/test_operator/test_operator.tar | Bin 5077 -> 27648 bytes scripts/db/data-cleaning-init.sql | 10 +- scripts/images/backend-python/Dockerfile | 3 +- 33 files changed, 2092 insertions(+), 58 deletions(-) create mode 100644 runtime/datamate-python/app/db/models/cleaning.py create mode 100644 runtime/datamate-python/app/module/cleaning/__init__.py create mode 100644 runtime/datamate-python/app/module/cleaning/exceptions.py create mode 100644 runtime/datamate-python/app/module/cleaning/interface/__init__.py create mode 100644 runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py create mode 100644 runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py create mode 100644 runtime/datamate-python/app/module/cleaning/repository/__init__.py create mode 100644 runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py create mode 100644 runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py create mode 100644 runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py create mode 100644 runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py create mode 100644 runtime/datamate-python/app/module/cleaning/runtime_client.py create mode 100644 runtime/datamate-python/app/module/cleaning/schema/__init__.py create mode 100644 runtime/datamate-python/app/module/cleaning/schema/cleaning.py create mode 100644 runtime/datamate-python/app/module/cleaning/service/__init__.py create mode 100644 runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py create mode 100644 runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py create mode 100644 runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py create mode 100644 runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py diff --git a/Makefile b/Makefile index 830a2246..859cd0db 100644 --- a/Makefile +++ b/Makefile @@ -231,8 +231,8 @@ else echo -n "Enter choice (default: 2): "; \ read DELETE_VOLUMES_CHOICE; \ export DELETE_VOLUMES_CHOICE; \ - fi - @$(MAKE) label-studio-$(INSTALLER)-uninstall DELETE_VOLUMES_CHOICE=$$DELETE_VOLUMES_CHOICE; \ + fi; \ + $(MAKE) label-studio-$(INSTALLER)-uninstall DELETE_VOLUMES_CHOICE=$$DELETE_VOLUMES_CHOICE; \ $(MAKE) milvus-$(INSTALLER)-uninstall DELETE_VOLUMES_CHOICE=$$DELETE_VOLUMES_CHOICE; \ $(MAKE) deer-flow-$(INSTALLER)-uninstall DELETE_VOLUMES_CHOICE=$$DELETE_VOLUMES_CHOICE; \ $(MAKE) datamate-$(INSTALLER)-uninstall DELETE_VOLUMES_CHOICE=$$DELETE_VOLUMES_CHOICE diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java index de9e1f28..ba163782 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java @@ -49,6 +49,9 @@ public RouteLocator customRouteLocator(RouteLocatorBuilder builder) { .route("data-operator", r -> r.path("/api/operators/**", 
"api/categories/**") .uri("http://datamate-backend-python:18000")) + .route("data-cleaning", r -> r.path("/api/cleaning/**") + .uri("http://datamate-backend-python:18000")) + .route("deer-flow-frontend", r -> r.path("/chat/**") .uri("http://deer-flow-frontend:3000")) diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index 9ba25700..1044b809 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -143,6 +143,7 @@ backend-python: - *datasetVolume - *flowVolume - *logVolume + - *operatorVolume volumeMounts: - name: dataset-volume mountPath: /dataset @@ -150,6 +151,8 @@ backend-python: mountPath: /flow - name: log-volume mountPath: /var/log/datamate + - name: operator-volume + mountPath: /operators gateway: env: diff --git a/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx b/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx index 057f446d..8636b7db 100644 --- a/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx +++ b/frontend/src/pages/DataCleansing/Create/components/ParamConfig.tsx @@ -26,11 +26,11 @@ const ParamConfig: React.FC = ({ onParamChange, }) => { if (!param) return null; - let defaultVal: any = param.defaultVal; + let defaultVal: any = operator.overrides?.[paramKey] ?? param.defaultVal; if (param.type === "range") { - defaultVal = Array.isArray(param.defaultVal) - ? param.defaultVal + defaultVal = Array.isArray(defaultVal) + ? defaultVal : [ param?.properties?.[0]?.defaultVal, param?.properties?.[1]?.defaultVal, @@ -217,6 +217,7 @@ const ParamConfig: React.FC = ({ key={paramKey} > { - const pythonProxyConfig = { - target: "http://localhost:18000", + proxy: { + "^/api": { + target: "http://localhost:8080", // 本地后端服务地址 changeOrigin: true, secure: false, - configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => { - proxy.on("proxyReq", (proxyReq: unknown) => { - (proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer"); - (proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin"); + rewrite: (path) => path.replace(/^\/api/, "/api"), + configure: (proxy, options) => { + // proxy 是 'http-proxy' 的实例 + proxy.on("proxyReq", (proxyReq, req, res) => { + // 可以在这里修改请求头 + proxyReq.removeHeader("referer"); + proxyReq.removeHeader("origin"); }); - proxy.on("proxyRes", (proxyRes: unknown) => { - const res = proxyRes as { headers: Record }; - delete res.headers["set-cookie"]; - res.headers["cookies"] = ""; + proxy.on("proxyRes", (proxyRes, req, res) => { + delete proxyRes.headers["set-cookie"]; + proxyRes.headers["cookies"] = ""; // 清除 cookies 头 }); }, - }; - - const javaProxyConfig = { - target: "http://localhost:8080", - changeOrigin: true, - secure: false, - configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => { - proxy.on("proxyReq", (proxyReq: unknown) => { - (proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer"); - (proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin"); - }); - proxy.on("proxyRes", (proxyRes: unknown) => { - const res = proxyRes as { headers: Record }; - delete res.headers["set-cookie"]; - res.headers["cookies"] = ""; - }); - }, - }; - - // Python 服务: rag, synthesis, annotation, evaluation, models - const pythonPaths = ["rag", "synthesis", "annotation", "data-collection", "evaluation", "models"]; - // Java 服务: data-management, knowledge-base - const javaPaths = ["data-management", "knowledge-base", 
"operators"]; - - const proxy: Record = {}; - for (const p of pythonPaths) { - proxy[`/api/${p}`] = pythonProxyConfig; - } - for (const p of javaPaths) { - proxy[`/api/${p}`] = javaProxyConfig; - } - return proxy; - })(), + }, + }, }, }); diff --git a/runtime/datamate-python/app/db/models/cleaning.py b/runtime/datamate-python/app/db/models/cleaning.py new file mode 100644 index 00000000..c2965be9 --- /dev/null +++ b/runtime/datamate-python/app/db/models/cleaning.py @@ -0,0 +1,59 @@ +from sqlalchemy import Column, String, BigInteger, Integer, TIMESTAMP +from app.db.models.base_entity import BaseEntity, Base + + +class CleaningTask(BaseEntity): + """Data cleaning task entity""" + __tablename__ = "t_clean_task" + + id = Column(String(36), primary_key=True, comment="Task ID") + name = Column(String(255), nullable=False, comment="Task name") + description = Column(String(1024), nullable=True, comment="Task description") + status = Column(String(50), nullable=False, default="PENDING", comment="Task status: PENDING, RUNNING, COMPLETED, STOPPED, FAILED") + src_dataset_id = Column(String(36), nullable=False, comment="Source dataset ID") + src_dataset_name = Column(String(255), nullable=False, comment="Source dataset name") + dest_dataset_id = Column(String(36), nullable=True, comment="Destination dataset ID") + dest_dataset_name = Column(String(255), nullable=True, comment="Destination dataset name") + before_size = Column(BigInteger, nullable=True, comment="Data size before cleaning") + after_size = Column(BigInteger, nullable=True, comment="Data size after cleaning") + file_count = Column(Integer, nullable=True, comment="Total file count") + retry_count = Column(Integer, default=0, nullable=False, comment="Retry count") + started_at = Column(TIMESTAMP, nullable=True, comment="Task start time") + finished_at = Column(TIMESTAMP, nullable=True, comment="Task finish time") + + +class CleaningTemplate(BaseEntity): + """Data cleaning template entity""" + __tablename__ = "t_clean_template" + + id = Column(String(36), primary_key=True, comment="Template ID") + name = Column(String(255), nullable=False, comment="Template name") + description = Column(String(1024), nullable=True, comment="Template description") + + +class CleaningResult(Base): + """Data cleaning result entity""" + __tablename__ = "t_clean_result" + + instance_id = Column(String(36), primary_key=True, comment="Instance ID (task or template ID)") + src_file_id = Column(String(36), primary_key=True, comment="Source file ID") + dest_file_id = Column(String(36), nullable=True, comment="Destination file ID") + src_name = Column(String(512), nullable=True, comment="Source file name") + dest_name = Column(String(512), nullable=True, comment="Destination file name") + src_type = Column(String(50), nullable=True, comment="Source file type") + dest_type = Column(String(50), nullable=True, comment="Destination file type") + src_size = Column(BigInteger, nullable=True, comment="Source file size") + dest_size = Column(BigInteger, nullable=True, comment="Destination file size") + status = Column(String(50), nullable=True, comment="Cleaning status: COMPLETED, FAILED, etc.") + result = Column(String(1024), nullable=True, comment="Cleaning result message") + + +class OperatorInstance(Base): + """Operator instance in task or template""" + __tablename__ = "t_operator_instance" + + instance_id = Column(String(36), primary_key=True, comment="Instance ID (task or template ID)") + operator_id = Column(String(36), primary_key=True, comment="Operator ID") + 
op_index = Column(Integer, nullable=False, comment="Operator execution order") + settings_override = Column(String(4096), nullable=True, comment="Operator settings override (JSON)") + diff --git a/runtime/datamate-python/app/module/__init__.py b/runtime/datamate-python/app/module/__init__.py index 9437b11d..edf8f547 100644 --- a/runtime/datamate-python/app/module/__init__.py +++ b/runtime/datamate-python/app/module/__init__.py @@ -9,6 +9,7 @@ from .rag.interface.rag_interface import router as rag_router from .operator.interface import operator_router from .operator.interface import category_router +from .cleaning.interface import router as cleaning_router router = APIRouter( prefix="/api" @@ -23,5 +24,6 @@ router.include_router(rag_router) router.include_router(operator_router) router.include_router(category_router) +router.include_router(cleaning_router) __all__ = ["router"] diff --git a/runtime/datamate-python/app/module/cleaning/__init__.py b/runtime/datamate-python/app/module/cleaning/__init__.py new file mode 100644 index 00000000..0d35bbc7 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/__init__.py @@ -0,0 +1,71 @@ +from .schema import ( + CleaningTaskStatus, + OperatorInstanceDto, + CleaningProcess, + CleaningTaskDto, + CreateCleaningTaskRequest, + CleaningResultDto, + CleaningTaskLog, + CleaningTemplateDto, + CreateCleaningTemplateRequest, + UpdateCleaningTemplateRequest, +) + +from .exceptions import ( + CleaningException, + CleaningNameDuplicationError, + CleaningTaskNotFoundError, + CleaningTemplateNotFoundError, + InvalidOperatorInputError, + ExecutorTypeError, + DatasetNotFoundError, + FileSystemError, + SettingsParseError, +) + +from .repository import ( + CleaningTaskRepository, + CleaningTemplateRepository, + CleaningResultRepository, + OperatorInstanceRepository, +) + +from .service import ( + CleanTaskValidator, + CleaningTaskScheduler, + CleaningTemplateService, + CleaningTaskService, +) + +from .runtime_client import RuntimeClient + +__all__ = [ + "CleaningTaskStatus", + "OperatorInstanceDto", + "CleaningProcess", + "CleaningTaskDto", + "CreateCleaningTaskRequest", + "CleaningResultDto", + "CleaningTaskLog", + "CleaningTemplateDto", + "CreateCleaningTemplateRequest", + "UpdateCleaningTemplateRequest", + "CleaningException", + "CleaningNameDuplicationError", + "CleaningTaskNotFoundError", + "CleaningTemplateNotFoundError", + "InvalidOperatorInputError", + "ExecutorTypeError", + "DatasetNotFoundError", + "FileSystemError", + "SettingsParseError", + "CleaningTaskRepository", + "CleaningTemplateRepository", + "CleaningResultRepository", + "OperatorInstanceRepository", + "CleanTaskValidator", + "CleaningTaskScheduler", + "CleaningTemplateService", + "CleaningTaskService", + "RuntimeClient", +] diff --git a/runtime/datamate-python/app/module/cleaning/exceptions.py b/runtime/datamate-python/app/module/cleaning/exceptions.py new file mode 100644 index 00000000..85c0718f --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/exceptions.py @@ -0,0 +1,57 @@ +from typing import Optional + + +class CleaningException(Exception): + """Base exception for cleaning module""" + def __init__(self, message: str, details: Optional[dict] = None): + self.message = message + self.details = details + super().__init__(self.message) + + +class CleaningNameDuplicationError(CleaningException): + """Exception raised when cleaning task name is duplicated""" + def __init__(self, name: str): + super().__init__(f"Cleaning task name '{name}' is duplicated") + + +class 
CleaningTaskNotFoundError(CleaningException): + """Exception raised when cleaning task is not found""" + def __init__(self, task_id: str): + super().__init__(f"Cleaning task '{task_id}' not found") + + +class CleaningTemplateNotFoundError(CleaningException): + """Exception raised when cleaning template is not found""" + def __init__(self, template_id: str): + super().__init__(f"Cleaning template '{template_id}' not found") + + +class InvalidOperatorInputError(CleaningException): + """Exception raised when operator input/output types are invalid""" + def __init__(self, message: str = "Invalid operator input/output types"): + super().__init__(message) + + +class ExecutorTypeError(CleaningException): + """Exception raised when executor type is invalid""" + def __init__(self, message: str = "Invalid executor type"): + super().__init__(message) + + +class DatasetNotFoundError(CleaningException): + """Exception raised when dataset is not found""" + def __init__(self, dataset_id: str): + super().__init__(f"Dataset '{dataset_id}' not found") + + +class FileSystemError(CleaningException): + """Exception raised when file system operations fail""" + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(f"File system error: {message}", details) + + +class SettingsParseError(CleaningException): + """Exception raised when operator settings parsing fails""" + def __init__(self, message: str, details: Optional[dict] = None): + super().__init__(f"Settings parse error: {message}", details) diff --git a/runtime/datamate-python/app/module/cleaning/interface/__init__.py b/runtime/datamate-python/app/module/cleaning/interface/__init__.py new file mode 100644 index 00000000..a8d5421d --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/interface/__init__.py @@ -0,0 +1,8 @@ +from fastapi import APIRouter + +from .cleaning_task_routes import router as task_router +from .cleaning_template_routes import router as template_router + +router = APIRouter() +router.include_router(task_router) +router.include_router(template_router) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py new file mode 100644 index 00000000..dae2a96c --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -0,0 +1,259 @@ +from typing import Optional + +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.logging import get_logger +from app.db.session import get_db +from app.module.cleaning.schema import ( + CleaningTaskDto, + CreateCleaningTaskRequest, + CleaningResultDto, + CleaningTaskLog, +) +from app.module.cleaning.service import CleaningTaskService +from app.module.shared.schema import StandardResponse, PaginatedData + +logger = get_logger(__name__) + +router = APIRouter(prefix="/cleaning/tasks", tags=["Cleaning Tasks"]) + + +def _get_operator_service(): + """Get operator service""" + from app.module.operator.service import OperatorService + from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, + ) + from app.module.operator.parsers import ParserHolder + from app.module.shared.file_service import FileService + from app.module.shared.chunk_upload_repository import ChunkUploadRepository + + return OperatorService( + operator_repo=OperatorRepository(None), + 
category_relation_repo=CategoryRelationRepository(None), + operator_release_repo=OperatorReleaseRepository(None), + parser_holder=ParserHolder(), + file_service=FileService(ChunkUploadRepository()), + ) + + +def _get_task_service(db: AsyncSession) -> CleaningTaskService: + """Get cleaning task service instance""" + from app.module.cleaning.service import ( + CleaningTaskScheduler, + CleanTaskValidator, + ) + from app.module.cleaning.repository import ( + CleaningTaskRepository, + CleaningResultRepository, + OperatorInstanceRepository, + ) + from app.module.cleaning.runtime_client import RuntimeClient + from app.module.dataset.service import DatasetManagementService + + runtime_client = RuntimeClient() + scheduler = CleaningTaskScheduler( + task_repo=CleaningTaskRepository(None), + runtime_client=runtime_client + ) + operator_service = _get_operator_service() + dataset_service = DatasetManagementService(db) + + return CleaningTaskService( + task_repo=CleaningTaskRepository(None), + result_repo=CleaningResultRepository(None), + operator_instance_repo=OperatorInstanceRepository(None), + operator_service=operator_service, + scheduler=scheduler, + validator=CleanTaskValidator(), + dataset_service=dataset_service, + ) + + +@router.get( + "", + response_model=StandardResponse[PaginatedData[CleaningTaskDto]], + summary="查询清洗任务列表", + description="根据参数查询清洗任务列表(支持分页、状态过滤、关键词搜索)", + tags=['mcp'] +) +async def get_cleaning_tasks( + page: int = 0, + size: int = 10, + status: Optional[str] = None, + keyword: Optional[str] = None, + db: AsyncSession = Depends(get_db), +): + """Query cleaning tasks""" + task_service = _get_task_service(db) + + tasks = await task_service.get_tasks(db, status, keyword, page, size) + count = await task_service.count_tasks(db, status, keyword) + total_pages = (count + size - 1) // size if size > 0 else 0 + + return StandardResponse( + code=200, + message="success", + data=PaginatedData( + page=page, + size=size, + total_elements=count, + total_pages=total_pages, + content=tasks, + ) + ) + + +@router.post( + "", + response_model=StandardResponse[CleaningTaskDto], + summary="创建清洗任务", + description="根据模板ID或算子列表创建清洗任务", + tags=['mcp'] +) +async def create_cleaning_task( + request: CreateCleaningTaskRequest, + db: AsyncSession = Depends(get_db), +): + """Create cleaning task""" + try: + task_service = _get_task_service(db) + + task = await task_service.create_task(db, request) + await db.commit() + + await task_service.execute_task(db, task.id) + await db.commit() + + return StandardResponse(code=200, message="success", data=task) + except Exception as e: + await db.rollback() + logger.error(f"Failed to create cleaning task: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/{task_id}", + response_model=StandardResponse[CleaningTaskDto], + summary="获取清洗任务详情", + description="根据ID获取清洗任务详细信息" +) +async def get_cleaning_task( + task_id: str, + db: AsyncSession = Depends(get_db), +): + """Get cleaning task by ID""" + try: + task_service = _get_task_service(db) + task = await task_service.get_task(db, task_id) + return StandardResponse(code=200, message="success", data=task) + except Exception as e: + logger.error(f"Failed to get cleaning task {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=404, detail=str(e)) + + +@router.delete( + "/{task_id}", + response_model=StandardResponse[str], + summary="删除清洗任务", + description="删除指定的清洗任务" +) +async def delete_cleaning_task( + task_id: str, + db: AsyncSession = Depends(get_db), +): + 
"""Delete cleaning task""" + try: + task_service = _get_task_service(db) + await task_service.delete_task(db, task_id) + await db.commit() + return StandardResponse(code=200, message="success", data=task_id) + except Exception as e: + await db.rollback() + logger.error(f"Failed to delete cleaning task {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/{task_id}/stop", + response_model=StandardResponse[str], + summary="停止清洗任务", + description="停止正在运行的清洗任务" +) +async def stop_cleaning_task( + task_id: str, + db: AsyncSession = Depends(get_db), +): + """Stop cleaning task""" + try: + task_service = _get_task_service(db) + await task_service.stop_task(db, task_id) + return StandardResponse(code=200, message="success", data=task_id) + except Exception as e: + logger.error(f"Failed to stop cleaning task {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "/{task_id}/execute", + response_model=StandardResponse[str], + summary="执行清洗任务", + description="重新执行清洗任务" +) +async def execute_cleaning_task( + task_id: str, + db: AsyncSession = Depends(get_db), +): + """Execute cleaning task""" + try: + task_service = _get_task_service(db) + await task_service.execute_task(db, task_id) + return StandardResponse(code=200, message="success", data=task_id) + except Exception as e: + await db.rollback() + logger.error(f"Failed to execute cleaning task {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/{task_id}/result", + response_model=StandardResponse[list[CleaningResultDto]], + summary="获取清洗任务结果", + description="获取指定清洗任务的执行结果" +) +async def get_cleaning_task_results( + task_id: str, + db: AsyncSession = Depends(get_db), +): + """Get cleaning task results""" + try: + task_service = _get_task_service(db) + results = await task_service.get_task_results(db, task_id) + return StandardResponse(code=200, message="success", data=results) + except Exception as e: + logger.error(f"Failed to get task results {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/{task_id}/log/{retry_count}", + response_model=StandardResponse[list[CleaningTaskLog]], + summary="获取清洗任务日志", + description="获取指定清洗任务的执行日志" +) +async def get_cleaning_task_log( + task_id: str, + retry_count: int, + db: AsyncSession = Depends(get_db), +): + """Get cleaning task log""" + try: + task_service = _get_task_service(db) + logs = await task_service.get_task_log(db, task_id, retry_count) + return StandardResponse(code=200, message="success", data=logs) + except Exception as e: + logger.error(f"Failed to get task log {task_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py new file mode 100644 index 00000000..3da722e0 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -0,0 +1,198 @@ +import math +from typing import Optional +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy import select, func + +from app.core.logging import get_logger +from app.db.session import get_db +from app.module.cleaning.schema import ( + CleaningTemplateDto, + CreateCleaningTemplateRequest, + UpdateCleaningTemplateRequest, + OperatorInstanceDto, +) +from 
app.module.cleaning.service import CleaningTemplateService +from app.module.shared.schema import StandardResponse, PaginatedData +from sqlalchemy.ext.asyncio import AsyncSession + +logger = get_logger(__name__) + +router = APIRouter(prefix="/cleaning/templates", tags=["Cleaning Templates"]) + + +def _get_operator_service(): + """Get operator service""" + from app.module.operator.service import OperatorService + from app.module.operator.repository import ( + OperatorRepository, + CategoryRelationRepository, + OperatorReleaseRepository, + ) + from app.module.operator.parsers import ParserHolder + from app.module.shared.file_service import FileService + from app.module.shared.chunk_upload_repository import ChunkUploadRepository + + return OperatorService( + operator_repo=OperatorRepository(None), + category_relation_repo=CategoryRelationRepository(None), + operator_release_repo=OperatorReleaseRepository(None), + parser_holder=ParserHolder(), + file_service=FileService(ChunkUploadRepository()), + ) + + +def _get_template_service(db: AsyncSession) -> CleaningTemplateService: + """Get cleaning template service instance""" + from app.module.cleaning.service import CleanTaskValidator + from app.module.cleaning.repository import ( + CleaningTemplateRepository, + OperatorInstanceRepository, + ) + from app.db.models.cleaning import CleaningTemplate, OperatorInstance + + operator_service = _get_operator_service() + + return CleaningTemplateService( + template_repo=CleaningTemplateRepository(None), + operator_instance_repo=OperatorInstanceRepository(None), + operator_service=operator_service, + validator=CleanTaskValidator(), + ) + + +@router.get( + "", + response_model=StandardResponse[PaginatedData[CleaningTemplateDto]], + summary="查询清洗模板列表", + description="分页查询清洗模板" +) +async def get_cleaning_templates( + page: int = Query(1, description="页码"), + size: int = Query(20, description="每页数量"), + keyword: Optional[str] = Query(None, description="关键词搜索"), + db: AsyncSession = Depends(get_db), +): + """Query cleaning templates with pagination""" + try: + from app.db.models.cleaning import CleaningTemplate + + template_service = _get_template_service(db) + + query = select(CleaningTemplate) + + if keyword: + keyword_pattern = f"%{keyword}%" + query = query.where( + CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern) + ) + + count_query = select(func.count()).select_from(query.subquery()) + total = (await db.execute(count_query)).scalar_one() + items = await template_service.get_templates(db, keyword) + + total_pages = math.ceil(total / size) if total > 0 else 0 + + return StandardResponse( + code=200, + message="success", + data=PaginatedData( + content=items, + total_elements=total, + total_pages=total_pages, + page=page, + size=size, + ) + ) + except Exception as e: + logger.error(f"Failed to get cleaning templates: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post( + "", + response_model=StandardResponse[CleaningTemplateDto], + summary="创建清洗模板", + description="创建新的清洗模板" +) +async def create_cleaning_template( + request: CreateCleaningTemplateRequest, + db: AsyncSession = Depends(get_db), +): + """Create cleaning template""" + try: + template_service = _get_template_service(db) + + template = await template_service.create_template(db, request) + await db.commit() + return StandardResponse(code=200, message="success", data=template) + except Exception as e: + await db.rollback() + logger.error(f"Failed to create 
cleaning template: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.get( + "/{template_id}", + response_model=StandardResponse[CleaningTemplateDto], + summary="获取清洗模板详情", + description="根据ID获取清洗模板详细信息" +) +async def get_cleaning_template( + template_id: str, + db: AsyncSession = Depends(get_db), +): + """Get cleaning template by ID""" + try: + template_service = _get_template_service(db) + + template = await template_service.get_template(db, template_id) + return StandardResponse(code=200, message="success", data=template) + except Exception as e: + logger.error(f"Failed to get cleaning template {template_id}: {e}", exc_info=True) + raise HTTPException(status_code=404, detail=str(e)) + + +@router.put( + "/{template_id}", + response_model=StandardResponse[CleaningTemplateDto], + summary="更新清洗模板", + description="更新清洗模板信息" +) +async def update_cleaning_template( + template_id: str, + request: UpdateCleaningTemplateRequest, + db: AsyncSession = Depends(get_db), +): + """Update cleaning template""" + try: + template_service = _get_template_service(db) + + template = await template_service.update_template(db, template_id, request) + await db.commit() + return StandardResponse(code=200, message="success", data=template) + except Exception as e: + await db.rollback() + logger.error(f"Failed to update cleaning template {template_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) + + +@router.delete( + "/{template_id}", + response_model=StandardResponse[str], + summary="删除清洗模板", + description="删除指定的清洗模板" +) +async def delete_cleaning_template( + template_id: str, + db: AsyncSession = Depends(get_db), +): + """Delete cleaning template""" + try: + template_service = _get_template_service(db) + await template_service.delete_template(db, template_id) + await db.commit() + return StandardResponse(code=200, message="success", data=template_id) + except Exception as e: + await db.rollback() + logger.error(f"Failed to delete cleaning template {template_id}: {e}", exc_info=True) + raise HTTPException(status_code=400, detail=str(e)) diff --git a/runtime/datamate-python/app/module/cleaning/repository/__init__.py b/runtime/datamate-python/app/module/cleaning/repository/__init__.py new file mode 100644 index 00000000..f8663a94 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/repository/__init__.py @@ -0,0 +1,11 @@ +from .cleaning_task_repository import CleaningTaskRepository +from .cleaning_template_repository import CleaningTemplateRepository +from .cleaning_result_repository import CleaningResultRepository +from .operator_instance_repository import OperatorInstanceRepository + +__all__ = [ + "CleaningTaskRepository", + "CleaningTemplateRepository", + "CleaningResultRepository", + "OperatorInstanceRepository", +] diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py new file mode 100644 index 00000000..a6aa62e3 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_result_repository.py @@ -0,0 +1,75 @@ +from typing import List, Optional +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, delete +from app.db.models.cleaning import CleaningResult +from app.module.cleaning.schema import CleaningResultDto + + +class CleaningResultRepository: + """Repository for cleaning result operations""" + + def __init__(self, model=None): + self.model = model 
if model else CleaningResult + + async def find_by_instance_id( + self, + db: AsyncSession, + instance_id: str, + status: Optional[str] = None + ) -> List[CleaningResultDto]: + """Query results by instance ID""" + query = select(self.model).where(self.model.instance_id == instance_id) + + if status: + query = query.where(self.model.status == status) + + result = await db.execute(query) + results = result.scalars().all() + + return [ + CleaningResultDto( + instance_id=res.instance_id, + src_file_id=res.src_file_id, + dest_file_id=res.dest_file_id, + src_name=res.src_name, + dest_name=res.dest_name, + src_type=res.src_type, + dest_type=res.dest_type, + src_size=res.src_size, + dest_size=res.dest_size, + status=res.status, + result=res.result + ) + for res in results + ] + + async def count_by_instance_id( + self, + db: AsyncSession, + instance_id: str + ) -> tuple[int, int]: + """Count results by instance ID (completed, failed)""" + total_query = select(self.model).where(self.model.instance_id == instance_id) + completed_query = total_query.where(self.model.status == "COMPLETED") + failed_query = total_query.where(self.model.status == "FAILED") + + total = len((await db.execute(total_query)).scalars().all()) + completed = len((await db.execute(completed_query)).scalars().all()) + failed = len((await db.execute(failed_query)).scalars().all()) + + return (completed, failed) + + async def delete_by_instance_id( + self, + db: AsyncSession, + instance_id: str, + status: Optional[str] = None + ) -> None: + """Delete results by instance ID""" + query = delete(self.model).where(self.model.instance_id == instance_id) + + if status: + query = query.where(self.model.status == status) + + await db.execute(query) + await db.flush() diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py new file mode 100644 index 00000000..56b1bfd0 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py @@ -0,0 +1,134 @@ +from typing import List, Optional +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, delete +from app.db.models.cleaning import CleaningTask +from app.module.cleaning.schema import CleaningTaskDto + + +class CleaningTaskRepository: + """Repository for cleaning task operations""" + + def __init__(self, model=None): + self.model = model if model else CleaningTask + + async def find_tasks( + self, + db: AsyncSession, + status: Optional[str] = None, + keyword: Optional[str] = None, + page: Optional[int] = None, + size: Optional[int] = None + ) -> List[CleaningTaskDto]: + """Query cleaning tasks""" + query = select(self.model) + + if status: + query = query.where(self.model.status == status) + + if keyword: + keyword_pattern = f"%{keyword}%" + query = query.where( + self.model.name.ilike(keyword_pattern) | self.model.description.ilike(keyword_pattern) + ) + + query = query.order_by(self.model.created_at.desc()) + + if page is not None and size is not None: + offset = max((page - 1) * size, 0) + query = query.offset(offset).limit(size) + + result = await db.execute(query) + tasks = result.scalars().all() + + return [ + CleaningTaskDto( + id=task.id, + name=task.name, + description=task.description, + status=task.status, + src_dataset_id=task.src_dataset_id, + src_dataset_name=task.src_dataset_name, + dest_dataset_id=task.dest_dataset_id, + dest_dataset_name=task.dest_dataset_name, + 
before_size=task.before_size, + after_size=task.after_size, + file_count=task.file_count, + retry_count=task.retry_count, + started_at=task.started_at, + finished_at=task.finished_at, + created_at=task.created_at + ) + for task in tasks + ] + + async def find_task_by_id(self, db: AsyncSession, task_id: str) -> Optional[CleaningTaskDto]: + """Query task by ID""" + query = select(self.model).where(self.model.id == task_id) + result = await db.execute(query) + task = result.scalar_one_or_none() + + if not task: + return None + + return CleaningTaskDto( + id=task.id, + name=task.name, + description=task.description, + status=task.status, + src_dataset_id=task.src_dataset_id, + src_dataset_name=task.src_dataset_name, + dest_dataset_id=task.dest_dataset_id, + dest_dataset_name=task.dest_dataset_name, + before_size=task.before_size, + after_size=task.after_size, + file_count=task.file_count, + retry_count=task.retry_count, + started_at=task.started_at, + finished_at=task.finished_at, + created_at=task.created_at + ) + + async def insert_task(self, db: AsyncSession, task: CleaningTaskDto) -> None: + """Insert new task""" + from app.db.models.cleaning import CleaningTask as CleaningTaskModel + + db_task = CleaningTaskModel( + id=task.id, + name=task.name, + description=task.description, + status=task.status, + src_dataset_id=task.src_dataset_id, + src_dataset_name=task.src_dataset_name, + dest_dataset_id=task.dest_dataset_id, + dest_dataset_name=task.dest_dataset_name, + before_size=task.before_size, + after_size=task.after_size, + file_count=task.file_count, + retry_count=task.retry_count + ) + db.add(db_task) + await db.flush() + + async def update_task(self, db: AsyncSession, task: CleaningTaskDto) -> None: + """Update task""" + query = select(CleaningTask).where(CleaningTask.id == task.id) + result = await db.execute(query) + db_task = result.scalar_one_or_none() + + if db_task: + if task.status: + db_task.status = task.status + if task.started_at: + db_task.started_at = task.started_at + if task.finished_at: + db_task.finished_at = task.finished_at + if task.retry_count is not None: + db_task.retry_count = task.retry_count + + await db.flush() + + async def delete_task_by_id(self, db: AsyncSession, task_id: str) -> None: + """Delete task by ID""" + query = delete(self.model).where(self.model.id == task_id) + await db.execute(query) + await db.flush() diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py new file mode 100644 index 00000000..b2aab16e --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py @@ -0,0 +1,57 @@ +from typing import List, Optional +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, delete +from app.db.models.cleaning import CleaningTemplate + + +class CleaningTemplateRepository: + """Repository for cleaning template operations""" + + def __init__(self, model=None): + self.model = model if model else CleaningTemplate + + async def find_all_templates( + self, + db: AsyncSession, + keyword: Optional[str] = None + ) -> List[CleaningTemplate]: + """Query all templates""" + query = select(self.model) + + if keyword: + keyword_pattern = f"%{keyword}%" + query = query.where( + self.model.name.ilike(keyword_pattern) | self.model.description.ilike(keyword_pattern) + ) + + query = query.order_by(self.model.created_at.desc()) + result = await db.execute(query) + return 
result.scalars().all() + + async def find_template_by_id(self, db: AsyncSession, template_id: str) -> Optional[CleaningTemplate]: + """Query template by ID""" + query = select(self.model).where(self.model.id == template_id) + result = await db.execute(query) + return result.scalar_one_or_none() + + async def insert_template(self, db: AsyncSession, template: CleaningTemplate) -> None: + """Insert new template""" + db.add(template) + await db.flush() + + async def update_template(self, db: AsyncSession, template: CleaningTemplate) -> None: + """Update template""" + query = select(self.model).where(self.model.id == template.id) + result = await db.execute(query) + db_template = result.scalar_one_or_none() + + if db_template: + db_template.name = template.name + db_template.description = template.description + await db.flush() + + async def delete_template(self, db: AsyncSession, template_id: str) -> None: + """Delete template""" + query = delete(self.model).where(self.model.id == template_id) + await db.execute(query) + await db.flush() diff --git a/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py b/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py new file mode 100644 index 00000000..b8a20b1d --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/repository/operator_instance_repository.py @@ -0,0 +1,56 @@ +import json +from typing import List +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, delete +from app.db.models.cleaning import OperatorInstance + + +class OperatorInstanceRepository: + """Repository for operator instance operations""" + + def __init__(self, model=None): + self.model = model if model else OperatorInstance + + async def find_operator_by_instance_id( + self, + db: AsyncSession, + instance_id: str + ) -> List[OperatorInstance]: + """Query operator instances by instance ID""" + query = select(self.model).where(self.model.instance_id == instance_id) + query = query.order_by(self.model.op_index.asc()) + result = await db.execute(query) + return result.scalars().all() + + async def find_instance_by_instance_id( + self, + db: AsyncSession, + instance_id: str + ) -> List[OperatorInstance]: + """Query instances for template (same as find_operator_by_instance_id)""" + return await self.find_operator_by_instance_id(db, instance_id) + + async def insert_instance( + self, + db: AsyncSession, + instance_id: str, + instances: List + ) -> None: + """Insert operator instances""" + from app.db.models.cleaning import OperatorInstance as OperatorInstanceModel + + for idx, instance in enumerate(instances): + db_instance = OperatorInstanceModel( + instance_id=instance_id, + operator_id=instance.id, + op_index=idx, + settings_override=json.dumps(instance.overrides), + ) + db.add(db_instance) + await db.flush() + + async def delete_by_instance_id(self, db: AsyncSession, instance_id: str) -> None: + """Delete instances by instance ID""" + query = delete(self.model).where(self.model.instance_id == instance_id) + await db.execute(query) + await db.flush() diff --git a/runtime/datamate-python/app/module/cleaning/runtime_client.py b/runtime/datamate-python/app/module/cleaning/runtime_client.py new file mode 100644 index 00000000..0983256f --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/runtime_client.py @@ -0,0 +1,61 @@ +import httpx +from typing import Optional +from app.core.logging import get_logger + +logger = get_logger(__name__) + + +class RuntimeClient: + """HTTP 
client for communicating with runtime service""" + + def __init__(self, base_url: str = "http://datamate-runtime:8081"): + self.base_url = base_url + self.client = httpx.AsyncClient(timeout=60.0) + + async def submit_task(self, task_id: str) -> bool: + """Submit cleaning task to runtime executor""" + try: + url = f"{self.base_url}/api/task/{task_id}/submit" + response = await self.client.post(url) + response.raise_for_status() + logger.info(f"Task {task_id} submitted successfully") + return True + except httpx.HTTPError as e: + logger.error(f"Failed to submit task {task_id}: {e}") + return False + except Exception as e: + logger.error(f"Unexpected error submitting task {task_id}: {e}") + return False + + async def stop_task(self, task_id: str) -> bool: + """Stop running cleaning task""" + try: + url = f"{self.base_url}/api/task/{task_id}/stop" + response = await self.client.post(url) + response.raise_for_status() + logger.info(f"Task {task_id} stopped successfully") + return True + except httpx.HTTPError as e: + logger.error(f"Failed to stop task {task_id}: {e}") + return False + except Exception as e: + logger.error(f"Unexpected error stopping task {task_id}: {e}") + return False + + async def get_task_status(self, task_id: str) -> Optional[dict]: + """Get task status from runtime""" + try: + url = f"{self.base_url}/api/task/{task_id}/status" + response = await self.client.get(url) + response.raise_for_status() + return response.json() + except httpx.HTTPError as e: + logger.error(f"Failed to get task status {task_id}: {e}") + return None + except Exception as e: + logger.error(f"Unexpected error getting task status {task_id}: {e}") + return None + + async def close(self): + """Close HTTP client""" + await self.client.aclose() diff --git a/runtime/datamate-python/app/module/cleaning/schema/__init__.py b/runtime/datamate-python/app/module/cleaning/schema/__init__.py new file mode 100644 index 00000000..6a38375b --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/schema/__init__.py @@ -0,0 +1,25 @@ +from .cleaning import ( + CleaningTaskStatus, + OperatorInstanceDto, + CleaningProcess, + CleaningTaskDto, + CreateCleaningTaskRequest, + CleaningResultDto, + CleaningTaskLog, + CleaningTemplateDto, + CreateCleaningTemplateRequest, + UpdateCleaningTemplateRequest, +) + +__all__ = [ + "CleaningTaskStatus", + "OperatorInstanceDto", + "CleaningProcess", + "CleaningTaskDto", + "CreateCleaningTaskRequest", + "CleaningResultDto", + "CleaningTaskLog", + "CleaningTemplateDto", + "CreateCleaningTemplateRequest", + "UpdateCleaningTemplateRequest", +] diff --git a/runtime/datamate-python/app/module/cleaning/schema/cleaning.py b/runtime/datamate-python/app/module/cleaning/schema/cleaning.py new file mode 100644 index 00000000..0571b29c --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/schema/cleaning.py @@ -0,0 +1,138 @@ +from typing import Optional, List, Dict, Any +from pydantic import BaseModel, Field +from datetime import datetime +from app.module.shared.schema.common import BaseResponseModel + + +class CleaningTaskStatus: + PENDING = "PENDING" + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + STOPPED = "STOPPED" + FAILED = "FAILED" + + +class OperatorInstanceDto(BaseResponseModel): + """Operator instance DTO for task or template""" + id: str = Field(..., description="Operator ID") + name: Optional[str] = Field(None, description="Operator name") + description: Optional[str] = Field(None, description="Operator description") + inputs: Optional[str] = Field(None, 
description="Input types: text/image/audio/video/multimodal") + outputs: Optional[str] = Field(None, description="Output types: text/image/audio/video/multimodal") + categories: Optional[List[str]] = Field(None, description="Category IDs") + settings: Optional[str] = Field(None, description="算子设置(JSON)") + overrides: Dict[str, Any] = Field(default_factory=dict, description="Operator parameter overrides") + + +class CleaningProcess(BaseResponseModel): + """Task progress information (matches Java version)""" + process: float = Field(..., description="Progress percentage") + successRate: float = Field(..., description="Success rate percentage") + totalFileNum: int = Field(..., description="Total file count") + succeedFileNum: int = Field(..., description="Succeeded file count") + failedFileNum: int = Field(..., description="Failed file count") + finishedFileNum: int = Field(..., description="Finished file count") + + @classmethod + def of(cls, total: int, succeed: int, failed: int) -> 'CleaningProcess': + """Create progress info (matches Java version logic)""" + finished_file_num = succeed + failed + + if total == 0: + process = 0.0 + else: + process = round(finished_file_num * 100.0 / total, 2) + + if finished_file_num == 0: + success_rate = 0.0 + else: + success_rate = round(succeed * 100.0 / finished_file_num, 2) + + return cls( + process=process, + successRate=success_rate, + totalFileNum=total, + succeedFileNum=succeed, + failedFileNum=failed, + finishedFileNum=finished_file_num, + ) + + +class CleaningTaskDto(BaseResponseModel): + """Cleaning task DTO""" + id: Optional[str] = Field(None, description="Task ID") + name: Optional[str] = Field(None, description="Task name") + description: Optional[str] = Field(None, description="Task description") + src_dataset_id: Optional[str] = Field(None, description="Source dataset ID") + src_dataset_name: Optional[str] = Field(None, description="Source dataset name") + dest_dataset_id: Optional[str] = Field(None, description="Destination dataset ID") + dest_dataset_name: Optional[str] = Field(None, description="Destination dataset name") + before_size: Optional[int] = Field(None, description="Data size before cleaning") + after_size: Optional[int] = Field(None, description="Data size after cleaning") + file_count: Optional[int] = Field(None, description="Total file count") + retry_count: Optional[int] = Field(None, description="Retry count") + status: Optional[str] = Field(None, description="Task status") + template_id: Optional[str] = Field(None, description="Template ID if created from template") + instance: Optional[List[OperatorInstanceDto]] = Field(None, description="Operator instances") + progress: Optional[CleaningProcess] = Field(None, description="Task progress") + created_at: Optional[datetime] = Field(None, description="Creation time") + started_at: Optional[datetime] = Field(None, description="Start time") + finished_at: Optional[datetime] = Field(None, description="Finish time") + + +class CreateCleaningTaskRequest(BaseResponseModel): + """Request to create cleaning task""" + name: str = Field(..., description="Cleaning task name") + description: str = Field(..., description="Cleaning task description") + src_dataset_id: str = Field(..., description="Source dataset ID") + src_dataset_name: str = Field(..., description="Source dataset name") + dest_dataset_id: Optional[str] = Field(None, description="Destination dataset ID") + dest_dataset_name: str = Field(..., description="Destination dataset name, creates new dataset if destDatasetId is 
empty") + dest_dataset_type: str = Field(..., description="Destination dataset type: TEXT/IMAGE/VIDEO/AUDIO/OTHER") + template_id: Optional[str] = Field(None, description="Template ID (alternative to instance)") + instance: List[OperatorInstanceDto] = Field(default_factory=list, description="Operator list (alternative to templateId)") + + +class CleaningResultDto(BaseResponseModel): + """Cleaning result DTO""" + instance_id: Optional[str] = Field(None, description="Instance ID") + src_file_id: Optional[str] = Field(None, description="Source file ID") + dest_file_id: Optional[str] = Field(None, description="Destination file ID") + src_name: Optional[str] = Field(None, description="Source file name") + dest_name: Optional[str] = Field(None, description="Destination file name") + src_type: Optional[str] = Field(None, description="Source file type") + dest_type: Optional[str] = Field(None, description="Destination file type") + src_size: Optional[int] = Field(None, description="Source file size") + dest_size: Optional[int] = Field(None, description="Destination file size") + status: Optional[str] = Field(None, description="Cleaning status") + result: Optional[str] = Field(None, description="Cleaning result message") + + +class CleaningTaskLog(BaseResponseModel): + """Task log entry""" + level: str = Field(..., description="Log level: INFO, WARN, ERROR") + message: str = Field(..., description="Log message") + + +class CleaningTemplateDto(BaseResponseModel): + """Cleaning template DTO""" + id: Optional[str] = Field(None, description="Template ID") + name: Optional[str] = Field(None, description="Template name") + description: Optional[str] = Field(None, description="Template description") + instance: List[OperatorInstanceDto] = Field(default_factory=list, description="Operator instances") + created_at: Optional[datetime] = Field(None, description="Creation time") + updated_at: Optional[datetime] = Field(None, description="Update time") + + +class CreateCleaningTemplateRequest(BaseResponseModel): + """Request to create cleaning template""" + name: str = Field(..., description="Template name") + description: str = Field(..., description="Template description") + instance: List[OperatorInstanceDto] = Field(..., description="Operator instances") + + +class UpdateCleaningTemplateRequest(BaseResponseModel): + """Request to update cleaning template""" + name: str = Field(..., description="Template name") + description: str = Field(..., description="Template description") + instance: List[OperatorInstanceDto] = Field(..., description="Operator instances") diff --git a/runtime/datamate-python/app/module/cleaning/service/__init__.py b/runtime/datamate-python/app/module/cleaning/service/__init__.py new file mode 100644 index 00000000..4a70f7a2 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/service/__init__.py @@ -0,0 +1,11 @@ +from .clean_task_validator import CleanTaskValidator +from .cleaning_task_scheduler import CleaningTaskScheduler +from .cleaning_template_service import CleaningTemplateService +from .cleaning_task_service import CleaningTaskService + +__all__ = [ + "CleanTaskValidator", + "CleaningTaskScheduler", + "CleaningTemplateService", + "CleaningTaskService", +] diff --git a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py new file mode 100644 index 00000000..4ea94464 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py @@ -0,0 
+1,61 @@ +import re +from app.module.cleaning.schema import OperatorInstanceDto +from app.module.cleaning.exceptions import InvalidOperatorInputError, ExecutorTypeError + + +class CleanTaskValidator: + """Validator for cleaning tasks and templates""" + + @staticmethod + def check_input_and_output(instances: list[OperatorInstanceDto]) -> None: + """Validate that operator input/output types are compatible""" + if not instances: + return + + for i in range(len(instances) - 1): + current = instances[i] + next_op = instances[i + 1] + + if not current.outputs: + raise InvalidOperatorInputError(f"Operator {current.id} has no outputs defined") + + if not next_op.inputs: + raise InvalidOperatorInputError(f"Operator {next_op.id} has no inputs defined") + + current_outputs = set(current.outputs.split(',')) + next_inputs = set(next_op.inputs.split(',')) + + if not current_outputs.intersection(next_inputs): + raise InvalidOperatorInputError( + f"Operator {current.id} outputs {current.outputs} " + f"but operator {next_op.id} requires {next_op.inputs}" + ) + + @staticmethod + def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: + """Check operator categories and determine executor type (datamate/datajuicer)""" + if not instances: + return "datamate" + + executor_types = set() + + for instance in instances: + if instance.categories: + for category in instance.categories: + if "datajuicer" in category.lower(): + executor_types.add("datajuicer") + elif "datamate" in category.lower(): + executor_types.add("datamate") + + if len(executor_types) > 1: + raise ExecutorTypeError( + "Cannot mix DataMate and DataJuicer operators in same task" + ) + + return executor_types.pop() if executor_types else "datamate" + + @staticmethod + def check_task_id(task_id: str) -> None: + """Validate task ID""" + if not task_id: + raise ValueError("Task ID cannot be empty") diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py new file mode 100644 index 00000000..cd1d1321 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_scheduler.py @@ -0,0 +1,41 @@ +from sqlalchemy.ext.asyncio import AsyncSession +from app.core.logging import get_logger +from app.module.cleaning.repository import CleaningTaskRepository +from app.module.cleaning.runtime_client import RuntimeClient + +logger = get_logger(__name__) + + +class CleaningTaskScheduler: + """Scheduler for executing cleaning tasks""" + + def __init__(self, task_repo: CleaningTaskRepository, runtime_client: RuntimeClient): + self.task_repo = task_repo + self.runtime_client = runtime_client + + async def execute_task(self, db: AsyncSession, task_id: str, retry_count: int) -> bool: + """Execute cleaning task""" + from app.module.cleaning.schema import CleaningTaskDto, CleaningTaskStatus + from datetime import datetime + + task = CleaningTaskDto() + task.id = task_id + task.status = CleaningTaskStatus.RUNNING + task.started_at = datetime.now() + task.retry_count = retry_count + + await self.task_repo.update_task(db, task) + return await self.runtime_client.submit_task(task_id) + + async def stop_task(self, db: AsyncSession, task_id: str) -> bool: + """Stop cleaning task""" + from app.module.cleaning.schema import CleaningTaskDto, CleaningTaskStatus + + await self.runtime_client.stop_task(task_id) + + task = CleaningTaskDto() + task.id = task_id + task.status = CleaningTaskStatus.STOPPED + + await 
self.task_repo.update_task(db, task) + return True diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py new file mode 100644 index 00000000..8350d179 --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -0,0 +1,408 @@ +import json +import os +import uuid +import re +import shutil +from pathlib import Path +from typing import List, Dict, Any, Set +from datetime import datetime + +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import text + +from app.core.logging import get_logger +from app.module.cleaning.schema import ( + CleaningTaskDto, + CreateCleaningTaskRequest, + CleaningResultDto, + CleaningTaskLog, + OperatorInstanceDto, + CleaningProcess, + CleaningTaskStatus, +) +from app.module.cleaning.repository import ( + CleaningTaskRepository, + CleaningResultRepository, + OperatorInstanceRepository, +) +from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler +from app.module.cleaning.service.clean_task_validator import CleanTaskValidator +from app.module.cleaning.exceptions import ( + CleaningTaskNotFoundError, + FileSystemError, +) + +logger = get_logger(__name__) + +DATASET_PATH = "/dataset" +FLOW_PATH = "/flow" + + +class CleaningTaskService: + """Service for managing cleaning tasks""" + + def __init__( + self, + task_repo: CleaningTaskRepository, + result_repo: CleaningResultRepository, + operator_instance_repo: OperatorInstanceRepository, + operator_service, + scheduler: CleaningTaskScheduler, + validator: CleanTaskValidator, + dataset_service, + ): + self.task_repo = task_repo + self.result_repo = result_repo + self.operator_instance_repo = operator_instance_repo + self.operator_service = operator_service + self.scheduler = scheduler + self.validator = validator + self.dataset_service = dataset_service + + async def get_tasks( + self, + db: AsyncSession, + status: str | None = None, + keyword: str | None = None, + page: int | None = None, + size: int | None = None, + ) -> List[CleaningTaskDto]: + """Get cleaning tasks""" + tasks = await self.task_repo.find_tasks(db, status, keyword, page, size) + + for task in tasks: + await self._set_process(db, task) + + return tasks + + async def _set_process(self, db: AsyncSession, task: CleaningTaskDto) -> None: + """Set task progress""" + completed, failed = await self.result_repo.count_by_instance_id(db, task.id) + task.progress = CleaningProcess.of(task.file_count or 0, completed, failed) + + async def count_tasks( + self, + db: AsyncSession, + status: str | None = None, + keyword: str | None = None, + ) -> int: + """Count cleaning tasks""" + tasks = await self.task_repo.find_tasks(db, status, keyword, None, None) + return len(tasks) + + async def get_task(self, db: AsyncSession, task_id: str) -> CleaningTaskDto: + """Get task by ID""" + task = await self.task_repo.find_task_by_id(db, task_id) + if not task: + raise CleaningTaskNotFoundError(task_id) + + await self._set_process(db, task) + + instances = await self.operator_instance_repo.find_operator_by_instance_id(db, task_id) + + # Batch query operators + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, + is_star=None) + operator_map = {op.id: op for op in all_operators} + + task.instance = [] + for inst in instances: + operator = operator_map.get(inst.operator_id) + if operator: + 
task.instance.append(OperatorInstanceDto( + id=operator.id, + name=operator.name, + description=operator.description, + inputs=operator.inputs, + outputs=operator.outputs, + settings=operator.settings, + categories=operator.categories, + )) + else: + task.instance.append(OperatorInstanceDto(id=inst.operator_id)) + + return task + + async def create_task( + self, + db: AsyncSession, + request: CreateCleaningTaskRequest + ) -> CleaningTaskDto: + """Create new cleaning task""" + if request.instance and request.template_id: + instances = await self.get_instance_by_template_id(db, request.template_id) + request.instance = instances + + self.validator.check_input_and_output(request.instance) + executor_type = self.validator.check_and_get_executor_type(request.instance) + + task_id = str(uuid.uuid4()) + + dest_dataset_id = request.dest_dataset_id + dest_dataset_name = request.dest_dataset_name + + if not dest_dataset_id: + logger.info(f"Creating new dataset: {dest_dataset_name}, type: {request.dest_dataset_type}") + dest_dataset_response = await self.dataset_service.create_dataset( + name=dest_dataset_name, + dataset_type=request.dest_dataset_type, + description="", + status="ACTIVE" + ) + dest_dataset_id = dest_dataset_response.id + logger.info(f"Successfully created dataset: {dest_dataset_id}") + else: + logger.info(f"Using existing dataset: {dest_dataset_id}") + + src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id) + if not src_dataset: + raise Exception(f"Source dataset not found: {request.src_dataset_id}") + + task_dto = CleaningTaskDto( + id=task_id, + name=request.name, + description=request.description, + status=CleaningTaskStatus.PENDING, + src_dataset_id=request.src_dataset_id, + src_dataset_name=request.src_dataset_name, + dest_dataset_id=dest_dataset_id, + dest_dataset_name=dest_dataset_name, + before_size=src_dataset.totalSize, + file_count=src_dataset.fileCount, + retry_count=-1, + ) + + await self.task_repo.insert_task(db, task_dto) + + await self.operator_instance_repo.insert_instance(db, task_id, request.instance) + + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, is_star=None) + operator_map = {op.id: op for op in all_operators} + + await self.prepare_task(dest_dataset_id, task_id, request.instance, operator_map, executor_type) + + return await self.get_task(db, task_id) + + async def prepare_task( + self, + dataset_id: str, + task_id: str, + instances: List[OperatorInstanceDto], + operator_map: dict, + executor_type: str, + ) -> None: + """Prepare task configuration file""" + process_config = { + "dataset_id": dataset_id, + "instance_id": task_id, + "dataset_path": f"{FLOW_PATH}/{task_id}/dataset.jsonl", + "export_path": f"{DATASET_PATH}/{dataset_id}", + "executor_type": executor_type, + "process": [], + } + + for instance in instances: + operator = operator_map.get(instance.id) + if not operator: + continue + + operator_config = self._get_default_values(operator) + operator_config.update(instance.overrides) + + runtime_config = self._get_runtime_config(operator) + operator_config.update(runtime_config) + + process_config["process"].append({instance.id: operator_config}) + + config_file_path = Path(f"{FLOW_PATH}/{task_id}/process.yaml") + config_file_path.parent.mkdir(parents=True, exist_ok=True) + + import yaml + try: + with open(config_file_path, 'w', encoding='utf-8') as f: + yaml.dump(process_config, f, default_flow_style=False, allow_unicode=True) + except Exception as e: + 
logger.error(f"Failed to write process.yaml: {e}") + raise FileSystemError(f"Failed to write process.yaml: {e}") + + def _get_default_values(self, operator) -> Dict[str, Any]: + """Get default values from operator settings""" + if not operator.settings: + return {} + + try: + settings = json.loads(operator.settings) + defaults = {} + + for key, value in settings.items(): + setting_type = value.get("type") + if "defaultVal" in value: + defaults[key] = value["defaultVal"] + + return defaults + except json.JSONDecodeError as e: + logger.error(f"Failed to parse settings: {e}") + return {} + + def _get_runtime_config(self, operator) -> Dict[str, Any]: + """Get runtime configuration from operator""" + if not operator.runtime: + return {} + + try: + return json.loads(operator.runtime) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse runtime config: {e}") + return {} + + async def scan_dataset( + self, + db: AsyncSession, + task_id: str, + src_dataset_id: str, + succeed_files: Set[str] | None = None, + ) -> None: + """Scan source dataset and create dataset.jsonl""" + target_file_path = Path(f"{FLOW_PATH}/{task_id}/dataset.jsonl") + target_file_path.parent.mkdir(parents=True, exist_ok=True) + + query = text(""" + SELECT id, file_name, file_path, file_type, file_size + FROM t_dm_dataset_files + WHERE dataset_id = :dataset_id + ORDER BY created_at + """) + + result = await db.execute(query, {"dataset_id": src_dataset_id}) + files = result.fetchall() + + with open(target_file_path, 'w', encoding='utf-8') as f: + for file in files: + if succeed_files and file.id in succeed_files: + continue + + file_info = { + "fileId": file.id, + "fileName": file.file_name, + "filePath": file.file_path, + "fileType": file.file_type, + "fileSize": file.file_size, + } + f.write(json.dumps(file_info, ensure_ascii=False) + "\n") + + async def get_task_results(self, db: AsyncSession, task_id: str) -> List[CleaningResultDto]: + """Get task results""" + return await self.result_repo.find_by_instance_id(db, task_id) + + async def get_task_log(self, db: AsyncSession, task_id: str, retry_count: int) -> List[CleaningTaskLog]: + """Get task log""" + self.validator.check_task_id(task_id) + + log_path = Path(f"{FLOW_PATH}/{task_id}/output.log") + if retry_count > 0: + log_path = Path(f"{FLOW_PATH}/{task_id}/output.log.{retry_count}") + + if not log_path.exists(): + return [] + + logs = [] + last_level = "INFO" + + standard_level_pattern = re.compile( + r"\b(DEBUG|Debug|INFO|Info|WARN|Warn|WARNING|Warning|ERROR|Error|FATAL|Fatal)\b" + ) + exception_suffix_pattern = re.compile(r"\b\w+(Warning|Error|Exception)\b") + + with open(log_path, 'r', encoding='utf-8') as f: + for line in f: + last_level = self._get_log_level(line, last_level, standard_level_pattern, exception_suffix_pattern) + logs.append(CleaningTaskLog(level=last_level, message=line.rstrip())) + + return logs + + def _get_log_level(self, line: str, default_level: str, std_pattern, ex_pattern) -> str: + """Extract log level from log line""" + if not line or not line.strip(): + return default_level + + std_match = std_pattern.search(line) + if std_match: + return std_match.group(1).upper() + + ex_match = ex_pattern.search(line) + if ex_match: + match = ex_match.group(1).upper() + if match == "WARNING": + return "WARN" + if match in ["ERROR", "EXCEPTION"]: + return "ERROR" + + return default_level + + async def delete_task(self, db: AsyncSession, task_id: str) -> None: + """Delete task""" + self.validator.check_task_id(task_id) + + await 
self.task_repo.delete_task_by_id(db, task_id) + await self.operator_instance_repo.delete_by_instance_id(db, task_id) + await self.result_repo.delete_by_instance_id(db, task_id) + + task_path = Path(f"{FLOW_PATH}/{task_id}") + if task_path.exists(): + try: + shutil.rmtree(task_path) + except Exception as e: + logger.warning(f"Failed to delete task path {task_id}: {e}") + + async def execute_task(self, db: AsyncSession, task_id: str) -> bool: + """Execute task""" + succeeded = await self.result_repo.find_by_instance_id(db, task_id, "COMPLETED") + succeed_set = {res.src_file_id for res in succeeded} + + task = await self.task_repo.find_task_by_id(db, task_id) + if not task: + raise CleaningTaskNotFoundError(task_id) + + await self.scan_dataset(db, task_id, task.src_dataset_id, succeed_set) + await self.result_repo.delete_by_instance_id(db, task_id, "FAILED") + + return await self.scheduler.execute_task(db, task_id, (task.retry_count or 0) + 1) + + async def stop_task(self, db: AsyncSession, task_id: str) -> bool: + """Stop task""" + return await self.scheduler.stop_task(db, task_id) + + async def get_instance_by_template_id( + self, + db: AsyncSession, + template_id: str + ) -> List[OperatorInstanceDto]: + """Get instances by template ID (delegated to template service)""" + instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id) + + # Batch query operators + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, + is_star=None) + operator_map = {op.id: op for op in all_operators} + + result = [] + for inst in instances: + operator = operator_map.get(inst.operator_id) + if operator: + operator_dto = OperatorInstanceDto( + id=operator.id, + name=operator.name, + description=operator.description, + inputs=operator.inputs, + outputs=operator.outputs, + settings=operator.settings, + categories=operator.categories, + ) + if inst.settings_override: + try: + operator_dto.overrides = json.loads(inst.settings_override) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse settings for {inst.operator_id}: {e}") + result.append(operator_dto) + + return result diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py new file mode 100644 index 00000000..8087a36e --- /dev/null +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py @@ -0,0 +1,224 @@ +import json +import uuid +from typing import List + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.logging import get_logger +from app.module.cleaning.schema import ( + CleaningTemplateDto, + CreateCleaningTemplateRequest, + OperatorInstanceDto, +) +from app.module.cleaning.repository import ( + CleaningTemplateRepository, + OperatorInstanceRepository, +) +from app.module.cleaning.service.clean_task_validator import CleanTaskValidator + +logger = get_logger(__name__) + + +class CleaningTemplateService: + """Service for managing cleaning templates""" + + def __init__( + self, + template_repo: CleaningTemplateRepository, + operator_instance_repo: OperatorInstanceRepository, + operator_service, + validator: CleanTaskValidator, + ): + self.template_repo = template_repo + self.operator_instance_repo = operator_instance_repo + self.operator_service = operator_service + self.validator = validator + + async def get_templates( + self, + db: AsyncSession, + keyword: str | None = None + ) -> 
List[CleaningTemplateDto]: + """Get all templates""" + templates = await self.template_repo.find_all_templates(db, keyword) + + # Collect all operator IDs + template_instances_map = {} + for template in templates: + instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template.id) + template_instances_map[template.id] = instances + + # Batch query all operators + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, + is_star=None) + operator_map = {op.id: op for op in all_operators} + + # Build result + result = [] + for template in templates: + template_dto = CleaningTemplateDto( + id=template.id, + name=template.name, + description=template.description, + instance=[], + created_at=template.created_at, + updated_at=template.updated_at, + ) + + instances = template_instances_map.get(template.id, []) + for inst in instances: + operator = operator_map.get(inst.operator_id) + if operator: + operator_dto = OperatorInstanceDto( + id=operator.id, + name=operator.name, + description=operator.description, + inputs=operator.inputs, + outputs=operator.outputs, + settings=operator.settings, + categories=operator.categories, + ) + if inst.settings_override: + try: + operator_dto.overrides = json.loads(inst.settings_override) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse settings for {inst.operator_id}: {e}") + template_dto.instance.append(operator_dto) + + result.append(template_dto) + + return result + + async def get_template( + self, + db: AsyncSession, + template_id: str + ) -> CleaningTemplateDto: + """Get template by ID""" + template = await self.template_repo.find_template_by_id(db, template_id) + if not template: + raise ValueError(f"Template {template_id} not found") + + template_dto = CleaningTemplateDto( + id=template.id, + name=template.name, + description=template.description, + instance=[], + created_at=template.created_at, + updated_at=template.updated_at, + ) + + instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id) + + # Batch query operators + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, + is_star=None) + operator_map = {op.id: op for op in all_operators} + + for inst in instances: + operator = operator_map.get(inst.operator_id) + if operator: + operator_dto = OperatorInstanceDto( + id=operator.id, + name=operator.name, + description=operator.description, + inputs=operator.inputs, + outputs=operator.outputs, + settings=operator.settings, + categories=operator.categories, + ) + if inst.settings_override: + try: + operator_dto.overrides = json.loads(inst.settings_override) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse settings for {inst.operator_id}: {e}") + template_dto.instance.append(operator_dto) + + return template_dto + + async def create_template( + self, + db: AsyncSession, + request: CreateCleaningTemplateRequest + ) -> CleaningTemplateDto: + """Create new template""" + from app.db.models.cleaning import CleaningTemplate + + self.validator.check_input_and_output(request.instance) + self.validator.check_and_get_executor_type(request.instance) + + template_id = str(uuid.uuid4()) + template = CleaningTemplate( + id=template_id, + name=request.name, + description=request.description, + ) + + await self.template_repo.insert_template(db, template) + + await self.operator_instance_repo.insert_instance(db, template_id, request.instance) + + return 
await self.get_template(db, template_id) + + async def update_template( + self, + db: AsyncSession, + template_id: str, + request: CreateCleaningTemplateRequest + ) -> CleaningTemplateDto: + """Update template""" + from app.db.models.cleaning import CleaningTemplate + + template = await self.template_repo.find_template_by_id(db, template_id) + if not template: + raise ValueError(f"Template {template_id} not found") + + template.name = request.name + template.description = request.description + + await self.template_repo.update_template(db, template) + await self.operator_instance_repo.delete_by_instance_id(db, template_id) + + await self.operator_instance_repo.insert_instance(db, template_id, request.instance) + + return await self.get_template(db, template_id) + + async def delete_template(self, db: AsyncSession, template_id: str) -> None: + """Delete template""" + await self.template_repo.delete_template(db, template_id) + await self.operator_instance_repo.delete_by_instance_id(db, template_id) + + async def get_instance_by_template_id( + self, + db: AsyncSession, + template_id: str + ) -> List[OperatorInstanceDto]: + """Get operator instances by template ID""" + instances = await self.operator_instance_repo.find_operator_by_instance_id(db, template_id) + + # Batch query operators + all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, + is_star=None) + operator_map = {op.id: op for op in all_operators} + + result = [] + for inst in instances: + operator = operator_map.get(inst.operator_id) + if operator: + operator_dto = OperatorInstanceDto( + id=operator.id, + name=operator.name, + description=operator.description, + inputs=operator.inputs, + outputs=operator.outputs, + settings=operator.settings, + categories=operator.categories, + ) + if inst.settings_override: + try: + operator_dto.overrides = json.loads(inst.settings_override) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse settings for {inst.operator_id}: {e}") + result.append(operator_dto) + + return result diff --git a/runtime/datamate-python/app/module/dataset/schema/__init__.py b/runtime/datamate-python/app/module/dataset/schema/__init__.py index 221c43f8..6a8b0bd0 100644 --- a/runtime/datamate-python/app/module/dataset/schema/__init__.py +++ b/runtime/datamate-python/app/module/dataset/schema/__init__.py @@ -10,6 +10,7 @@ from .dataset import ( DatasetResponse, DatasetTypeResponse, + CreateDatasetRequest, ) __all__ = [ @@ -21,4 +22,5 @@ "BatchUpdateFileTagsResponse", "FileTagUpdateResult", "FileTagUpdate", + "CreateDatasetRequest", ] \ No newline at end of file diff --git a/runtime/datamate-python/app/module/dataset/schema/dataset.py b/runtime/datamate-python/app/module/dataset/schema/dataset.py index 84334d8c..8095857f 100644 --- a/runtime/datamate-python/app/module/dataset/schema/dataset.py +++ b/runtime/datamate-python/app/module/dataset/schema/dataset.py @@ -9,6 +9,7 @@ class DatasetType(Enum): IMAGE = "IMAGE" AUDIO = "AUDIO" VIDEO = "VIDEO" + OTHER = "OTHER" class DatasetTypeResponse(BaseModel): """数据集类型响应模型""" @@ -18,6 +19,16 @@ class DatasetTypeResponse(BaseModel): supportedFormats: List[str] = Field(default_factory=list, description="支持的文件格式") icon: Optional[str] = Field(None, description="图标") +class CreateDatasetRequest(BaseModel): + """创建数据集请求模型""" + name: str = Field(..., description="数据集名称", min_length=1, max_length=100) + description: Optional[str] = Field(None, description="数据集描述", max_length=500) + datasetType: DatasetType = 
Field(..., description="数据集类型", alias="datasetType") + tags: Optional[List[str]] = Field(None, description="标签列表") + dataSource: Optional[str] = Field(None, description="数据源") + retentionDays: Optional[int] = Field(None, description="保留天数") + status: Optional[str] = Field(None, description="数据集状态") + class DatasetResponse(BaseModel): """DM服务数据集响应模型""" id: str = Field(..., description="数据集ID") diff --git a/runtime/datamate-python/app/module/dataset/service/service.py b/runtime/datamate-python/app/module/dataset/service/service.py index 290bd6f9..3190cc47 100644 --- a/runtime/datamate-python/app/module/dataset/service/service.py +++ b/runtime/datamate-python/app/module/dataset/service/service.py @@ -62,6 +62,84 @@ async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]: logger.error(f"Failed to get dataset {dataset_id}: {e}") return None + async def create_dataset( + self, + name: str, + dataset_type: str, + description: str = "", + status: Optional[str] = None, + ) -> DatasetResponse: + """ + 创建数据集(参考Java版本DatasetApplicationService.createDataset) + + Args: + name: 数据集名称 + dataset_type: 数据集类型(TEXT/IMAGE/VIDEO/AUDIO/OTHER) + description: 数据集描述 + status: 数据集状态 + + Returns: + 创建的数据集响应 + """ + try: + logger.info(f"Creating dataset: {name}, type: {dataset_type}") + + # 1. 检查数据集名称是否已存在 + result = await self.db.execute( + select(Dataset).where(Dataset.name == name) + ) + existing_dataset = result.scalar_one_or_none() + if existing_dataset: + error_msg = f"Dataset with name '{name}' already exists" + logger.error(error_msg) + raise Exception(error_msg) + + # 2. 创建数据集对象 + dataset_id = str(uuid.uuid4()) + dataset_path = f"{os.path.join('/dataset', dataset_id)}" + + # 如果没有提供status,默认为DRAFT + if status is None: + status = "DRAFT" + + new_dataset = Dataset( + id=dataset_id, + name=name, + description=description, + dataset_type=dataset_type, + path=dataset_path, + size_bytes=0, + file_count=0, + status=status, + dataset_metadata="{}", + version=0, + created_by="system", + ) + + self.db.add(new_dataset) + await self.db.flush() + await self.db.commit() + + logger.info(f"Successfully created dataset: {new_dataset.id}") + + return DatasetResponse( + id=new_dataset.id, # type: ignore + name=new_dataset.name, # type: ignore + description=new_dataset.description or "", # type: ignore + datasetType=new_dataset.dataset_type, # type: ignore + status=new_dataset.status, # type: ignore + fileCount=new_dataset.file_count or 0, # type: ignore + totalSize=new_dataset.size_bytes or 0, # type: ignore + createdAt=new_dataset.created_at, # type: ignore + updatedAt=new_dataset.updated_at, # type: ignore + createdBy=new_dataset.created_by # type: ignore + ) + + except Exception as e: + await self.db.rollback() + logger.error(f"Failed to create dataset: {e}") + raise Exception(f"Failed to create dataset: {str(e)}") + async def get_dataset_files( self, dataset_id: str, diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 3ba4fa84..43594e94 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -128,7 +128,8 @@ async def get_operators( ov.file_size, ov.usage_count, ov.created_at, - ov.updated_at + ov.updated_at, + string_agg(ov.category_id, ',' ORDER BY ov.created_at DESC) AS categories FROM v_operator ov {where_clause} {group_by} @@ -143,6 +144,10 @@ async def get_operators( # Convert to 
DTOs operators = [] for row in rows: + categories_list = [] + if row.categories: + categories_list = [cat_id for cat_id in row.categories.split(',') if cat_id] + + operators.append(OperatorDto( id=row.id, name=row.name, @@ -157,6 +162,7 @@ metrics=None, usage_count=row.usage_count, is_star=row.is_star, + categories=categories_list, created_at=row.created_at, updated_at=row.updated_at, )) @@ -236,7 +242,7 @@ async def get_operator_by_id( operator_id, operator_name, description, version, inputs, outputs, runtime, settings, is_star, file_name, file_size, usage_count, metrics, created_at, updated_at, created_by, updated_by, - STRING_AGG(category_name, ',' ORDER BY created_at DESC) AS categories + string_agg(category_name, ',' ORDER BY created_at DESC) AS categories FROM v_operator WHERE operator_id = :operator_id GROUP BY operator_id, operator_name, description, version, inputs, outputs, runtime, diff --git a/runtime/ops/examples/test_operator/metadata.yml b/runtime/ops/examples/test_operator/metadata.yml index 2320c9ed..fb1b59b8 100644 --- a/runtime/ops/examples/test_operator/metadata.yml +++ b/runtime/ops/examples/test_operator/metadata.yml @@ -22,8 +22,8 @@ metrics: runtime: memory: 10485760 cpu: 0.05 - gpu: 0.1 - npu: 0.1 + gpu: 0 + npu: 0 settings: sliderParam: name: '滑窗测试' diff --git a/runtime/ops/examples/test_operator/test_operator.tar b/runtime/ops/examples/test_operator/test_operator.tar index dc986c1dbf138f28937464c5d57341de1a983a49..e14771ea6b3e689dd3b5842fd26428265a8daad1 100644 Binary files a/runtime/ops/examples/test_operator/test_operator.tar and b/runtime/ops/examples/test_operator/test_operator.tar differ
zw}8)L{jp{&W(C^_l(@}e0ow@FUvQ~Jzag=S+YHv8qWt0}P2aV3_JRM7A=$+s{H6$= zAKF0Np1ns76nEa(O^lKkQ)>45YW*(#G73o2XSrx%xmF_Sw<{R9^}YfhQdh&wrQCyY zR5FJsXjxko5rV5C>t0;=ub0GF%lvVMX4xkeeeyqz8hJ)xaY?#aj>wDJH%6hg?+OZy z+VoP@=@OQQ*vmCns~_i|)+iHaC5}e1Q}J|o4fqD9W9e(3j#0k7N&)w(6Dq1eU84qd zl`7OV>QGBmqVfVOYcadQ2P}%$rn)x93S0LT!o}M5-#$Bv%gGIHU}g2jSRwkZ#tNgf zd}VQY$?|f8+SN >ew^5XIhYoo2ZT?E$p z@pUT5e=B-fTnTRKB`-1DFkful?R+r`!);7qSqeF9ezfu9ZD(w6wmBQj;+d@)<2BCz zJvHV1|Ltkz^Z)m{nfrfR=Lgocf3g2>J^y#P%lm)Z(LDP~p8xx6yncWA`2XE+TOa+` zwSSM-U1N{`poiu7za4Gut8bgQ`z9{GnOl5YeeG@YSKh4mhQxpB&_8Be&BrD#H8IFK z`%6Sv8ejQ3-8*!hu6&j5H-43_^cA|d?Mwd}o3p`wDeKLTvHA6Xjn7q%|J&2b>;Lz< znd|?p^8@SJ|BCv5zsFNsUjJ`LD_#Fz Hofzu*0~_0fNQ`*(Tm_5U> Date: Tue, 10 Feb 2026 14:29:27 +0800 Subject: [PATCH 13/20] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86pytho?= =?UTF-8?q?n=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../interface/cleaning_task_routes.py | 20 +++++---- .../interface/cleaning_template_routes.py | 10 ++--- .../cleaning/service/cleaning_task_service.py | 42 +++++++++++++++++++ .../operator/interface/category_routes.py | 2 +- .../operator/interface/operator_routes.py | 16 +++---- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py index dae2a96c..dc233f77 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -53,6 +53,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: ) from app.module.cleaning.runtime_client import RuntimeClient from app.module.dataset.service import DatasetManagementService + from app.module.shared.common.lineage import LineageService runtime_client = RuntimeClient() scheduler = CleaningTaskScheduler( @@ -61,6 +62,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: ) operator_service = _get_operator_service() dataset_service = DatasetManagementService(db) + lineage_service = LineageService(db) return CleaningTaskService( task_repo=CleaningTaskRepository(None), @@ -70,6 +72,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: scheduler=scheduler, validator=CleanTaskValidator(), dataset_service=dataset_service, + lineage_service=lineage_service, ) @@ -95,7 +98,7 @@ async def get_cleaning_tasks( total_pages = (count + size - 1) // size if size > 0 else 0 return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( page=page, @@ -128,7 +131,7 @@ async def create_cleaning_task( await task_service.execute_task(db, task.id) await db.commit() - return StandardResponse(code=200, message="success", data=task) + return StandardResponse(code="0", message="success", data=task) except Exception as e: await db.rollback() logger.error(f"Failed to create cleaning task: {e}", exc_info=True) @@ -149,7 +152,7 @@ async def get_cleaning_task( try: task_service = _get_task_service(db) task = await task_service.get_task(db, task_id) - return StandardResponse(code=200, message="success", data=task) + return StandardResponse(code="0", message="success", data=task) except Exception as e: logger.error(f"Failed to get cleaning task {task_id}: {e}", exc_info=True) raise HTTPException(status_code=404, detail=str(e)) @@ -170,7 +173,7 @@ async def delete_cleaning_task( task_service = _get_task_service(db) await 
task_service.delete_task(db, task_id) await db.commit() - return StandardResponse(code=200, message="success", data=task_id) + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: await db.rollback() logger.error(f"Failed to delete cleaning task {task_id}: {e}", exc_info=True) @@ -191,7 +194,7 @@ async def stop_cleaning_task( try: task_service = _get_task_service(db) await task_service.stop_task(db, task_id) - return StandardResponse(code=200, message="success", data=task_id) + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: logger.error(f"Failed to stop cleaning task {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) @@ -211,7 +214,8 @@ async def execute_cleaning_task( try: task_service = _get_task_service(db) await task_service.execute_task(db, task_id) - return StandardResponse(code=200, message="success", data=task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) except Exception as e: await db.rollback() logger.error(f"Failed to execute cleaning task {task_id}: {e}", exc_info=True) @@ -232,7 +236,7 @@ async def get_cleaning_task_results( try: task_service = _get_task_service(db) results = await task_service.get_task_results(db, task_id) - return StandardResponse(code=200, message="success", data=results) + return StandardResponse(code="0", message="success", data=results) except Exception as e: logger.error(f"Failed to get task results {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) @@ -253,7 +257,7 @@ async def get_cleaning_task_log( try: task_service = _get_task_service(db) logs = await task_service.get_task_log(db, task_id, retry_count) - return StandardResponse(code=200, message="success", data=logs) + return StandardResponse(code="0", message="success", data=logs) except Exception as e: logger.error(f"Failed to get task log {task_id}: {e}", exc_info=True) raise HTTPException(status_code=400, detail=str(e)) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py index 3da722e0..85abbb25 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -93,7 +93,7 @@ async def get_cleaning_templates( total_pages = math.ceil(total / size) if total > 0 else 0 return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( content=items, @@ -124,7 +124,7 @@ async def create_cleaning_template( template = await template_service.create_template(db, request) await db.commit() - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: await db.rollback() logger.error(f"Failed to create cleaning template: {e}", exc_info=True) @@ -146,7 +146,7 @@ async def get_cleaning_template( template_service = _get_template_service(db) template = await template_service.get_template(db, template_id) - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: logger.error(f"Failed to get cleaning template {template_id}: {e}", exc_info=True) raise HTTPException(status_code=404, detail=str(e)) @@ -169,7 +169,7 @@ async def update_cleaning_template( 
template = await template_service.update_template(db, template_id, request) await db.commit() - return StandardResponse(code=200, message="success", data=template) + return StandardResponse(code="0", message="success", data=template) except Exception as e: await db.rollback() logger.error(f"Failed to update cleaning template {template_id}: {e}", exc_info=True) @@ -191,7 +191,7 @@ async def delete_cleaning_template( template_service = _get_template_service(db) await template_service.delete_template(db, template_id) await db.commit() - return StandardResponse(code=200, message="success", data=template_id) + return StandardResponse(code="0", message="success", data=template_id) except Exception as e: await db.rollback() logger.error(f"Failed to delete cleaning template {template_id}: {e}", exc_info=True) diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py index 8350d179..12ab0c6a 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -31,6 +31,9 @@ CleaningTaskNotFoundError, FileSystemError, ) +from app.module.shared.schema.lineage import NodeType, EdgeType +from app.db.models.base_entity import LineageNode, LineageEdge +from app.module.shared.common.lineage import LineageService logger = get_logger(__name__) @@ -50,6 +53,7 @@ def __init__( scheduler: CleaningTaskScheduler, validator: CleanTaskValidator, dataset_service, + lineage_service: LineageService, ): self.task_repo = task_repo self.result_repo = result_repo @@ -58,6 +62,7 @@ def __init__( self.scheduler = scheduler self.validator = validator self.dataset_service = dataset_service + self.lineage_service = lineage_service async def get_tasks( self, @@ -153,6 +158,7 @@ async def create_task( logger.info(f"Successfully created dataset: {dest_dataset_id}") else: logger.info(f"Using existing dataset: {dest_dataset_id}") + dest_dataset_response = await self.dataset_service.get_dataset(dest_dataset_id) src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id) if not src_dataset: @@ -174,6 +180,8 @@ async def create_task( await self.task_repo.insert_task(db, task_dto) + await self._add_cleaning_to_graph(src_dataset, task_dto, dest_dataset_response) + await self.operator_instance_repo.insert_instance(db, task_id, request.instance) all_operators = await self.operator_service.get_operators(db=db, page=0, size=1000, categories=[], keyword=None, is_star=None) @@ -183,6 +191,40 @@ async def create_task( return await self.get_task(db, task_id) + async def _add_cleaning_to_graph( + self, + src_dataset, + task: CleaningTaskDto, + dest_dataset, + ) -> None: + """ + 添加清洗任务到血缘图 + """ + from_node = LineageNode( + id=src_dataset.id, + node_type=NodeType.DATASET.value, + name=src_dataset.name, + description=src_dataset.description or "", + ) + + to_node = LineageNode( + id=dest_dataset.id, + node_type=NodeType.DATASET.value, + name=dest_dataset.name, + description=dest_dataset.description or "", + ) + + edge = LineageEdge( + process_id=task.id, + name=task.name or "", + description=task.description or "", + edge_type=EdgeType.DATA_CLEANING.value, + from_node_id=from_node.id, + to_node_id=to_node.id, + ) + + await self.lineage_service.generate_graph(from_node, edge, to_node) + async def prepare_task( self, dataset_id: str, diff --git a/runtime/datamate-python/app/module/operator/interface/category_routes.py 
b/runtime/datamate-python/app/module/operator/interface/category_routes.py index ed4207e0..f4be2b43 100644 --- a/runtime/datamate-python/app/module/operator/interface/category_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/category_routes.py @@ -40,4 +40,4 @@ async def get_category_tree( ): """获取分类树""" result = await service.get_all_categories(db) - return StandardResponse(code=200, message="success", data=result) + return StandardResponse(code="0", message="success", data=result) diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py index ee3b9c78..b5eb0a97 100644 --- a/runtime/datamate-python/app/module/operator/interface/operator_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -73,7 +73,7 @@ async def list_operators( total_pages = (count + request.size - 1) // request.size # Ceiling division return StandardResponse( - code=200, + code="0", message="success", data=PaginatedData( page=request.page, @@ -100,7 +100,7 @@ async def get_operator( try: operator = await service.get_operator_by_id(operator_id, db) operator.file_name = None # Don't return file_name - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except ValueError as e: raise HTTPException(status_code=404, detail=str(e)) @@ -121,7 +121,7 @@ async def update_operator( try: operator = await service.update_operator(operator_id, request, db) await db.commit() - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: logger.error(f"{operator_id} {request}", e) await db.rollback() @@ -143,7 +143,7 @@ async def create_operator( try: operator = await service.create_operator(request, db) await db.commit() - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) @@ -166,7 +166,7 @@ async def upload_operator( if not file_name: raise HTTPException(status_code=422, detail="fileName is required") operator = await service.upload_operator(file_name, db) - return StandardResponse(code=200, message="success", data=operator) + return StandardResponse(code="0", message="success", data=operator) except Exception as e: logger.error(f"{file_name}", e) raise HTTPException(status_code=400, detail=str(e)) @@ -187,7 +187,7 @@ async def pre_upload( req_id = await service.pre_upload(db) await db.commit() return StandardResponse( - code=200, + code="0", message="success", data=req_id, ) @@ -227,7 +227,7 @@ async def chunk_upload( db=db ) await db.commit() - return StandardResponse(code=200, message="success", data=result.dict()) + return StandardResponse(code="0", message="success", data=result.dict()) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) @@ -248,7 +248,7 @@ async def delete_operator( try: await service.delete_operator(operator_id, db) await db.commit() - return StandardResponse(code=200, message="success", data=None) + return StandardResponse(code="0", message="success", data=None) except Exception as e: await db.rollback() raise HTTPException(status_code=400, detail=str(e)) From 90e40c6fe64f62a4de129004f102033f591df414 Mon Sep 17 00:00:00 2001 From: hhhhsc 
<1710496817@qq.com> Date: Tue, 10 Feb 2026 15:14:11 +0800 Subject: [PATCH 14/20] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86?= =?UTF-8?q?=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/public/config/error-code.json | 31 ++-- .../OperatorMarket/Home/OperatorMarket.tsx | 12 +- .../app/core/exception/codes.py | 20 +++ .../app/core/exception/middleware.py | 2 +- .../app/module/cleaning/__init__.py | 21 --- .../interface/cleaning_task_routes.py | 91 ++++------- .../interface/cleaning_template_routes.py | 126 +++++++--------- .../repository/cleaning_task_repository.py | 8 +- .../cleaning_template_repository.py | 8 +- .../cleaning/service/clean_task_validator.py | 41 ++++- .../cleaning/service/cleaning_task_service.py | 36 ++--- .../service/cleaning_template_service.py | 18 ++- .../operator/interface/operator_routes.py | 142 +++++++----------- .../operator/service/operator_service.py | 14 +- 14 files changed, 264 insertions(+), 306 deletions(-) diff --git a/frontend/public/config/error-code.json b/frontend/public/config/error-code.json index 17137f5e..8270f4db 100644 --- a/frontend/public/config/error-code.json +++ b/frontend/public/config/error-code.json @@ -1,20 +1,25 @@ { + "0": "成功", + "cleaning.0001": "清洗任务不存在", + "cleaning.0002": "清洗任务名称重复", + "cleaning.0003": "清洗模板不存在", + "cleaning.0004": "清洗模板名称重复", + "cleaning.0005": "算子输入输出类型不匹配", + "cleaning.0006": "执行器类型无效", + "cleaning.0007": "数据集不存在", + "cleaning.0008": "文件系统错误", + "cleaning.0009": "设置解析错误", + "cleaning.0010": "任务ID不能为空", + "operator.0001": "算子不存在", + "operator.0002": "算子正在使用中", + "operator.0003": "无法删除预置算子", + "operator.0004": "不支持的文件类型", + "operator.0005": "解析算子包失败", + "operator.0006": "缺少必要的字段", "400": "请求参数错误", "401": "登录已过期,请重新登录", "403": "没有权限访问该资源", "404": "请求的资源不存在", "500": "服务器内部错误,请稍后重试", - "502": "网关错误", - "op.0001": "不支持的文件类型", - "op.0002": "算子中缺少元数据文件", - "op.0003": "缺少必要的字段", - "op.0004": "settings字段解析失败", - "op.0005": "算子ID已存在", - "op.0006": "算子名称已存在", - "op.0007": "算子已被编排在模板或未完成的任务中", - "op.0008": "预置算子无法删除", - "clean.0001": "清洗任务名称重复", - "clean.0002": "任务列表为空", - "clean.0003": "算子输入输出不匹配", - "clean.0004": "算子执行器不匹配" + "502": "网关错误" } \ No newline at end of file diff --git a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx index c22d1bc7..04dcf210 100644 --- a/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx +++ b/frontend/src/pages/OperatorMarket/Home/OperatorMarket.tsx @@ -76,14 +76,10 @@ export default function OperatorMarketPage() { }; const handleDeleteOperator = async (operator: OperatorI) => { - try { - await deleteOperatorByIdUsingDelete(operator.id); - message.success(t("operatorMarket.home.operations.messages.deleteSuccess")); - fetchData(); - await initCategoriesTree(); - } catch (error) { - message.error(t("operatorMarket.home.operations.messages.deleteFailed")); - } + await deleteOperatorByIdUsingDelete(operator.id); + message.success(t("operatorMarket.home.operations.messages.deleteSuccess")); + fetchData(); + await initCategoriesTree(); }; const handleStar = async (operator: OperatorI) => { diff --git a/runtime/datamate-python/app/core/exception/codes.py b/runtime/datamate-python/app/core/exception/codes.py index d741174b..294e6d56 100644 --- a/runtime/datamate-python/app/core/exception/codes.py +++ b/runtime/datamate-python/app/core/exception/codes.py @@ -86,6 +86,26 @@ def __init__(self): RATIO_ALREADY_EXISTS: 
Final = ErrorCode("ratio.0003", "Task already exists", 400) RATIO_DELETE_FAILED: Final = ErrorCode("ratio.0004", "Failed to delete task", 500) + # ========== 清洗模块 ========== + CLEANING_TASK_NOT_FOUND: Final = ErrorCode("cleaning.0001", "Cleaning task not found", 404) + CLEANING_NAME_DUPLICATED: Final = ErrorCode("cleaning.0002", "Cleaning task name is duplicated", 400) + CLEANING_TEMPLATE_NOT_FOUND: Final = ErrorCode("cleaning.0003", "Cleaning template not found", 404) + CLEANING_TEMPLATE_NAME_DUPLICATED: Final = ErrorCode("cleaning.0004", "Cleaning template name is duplicated", 400) + CLEANING_INVALID_OPERATOR_INPUT: Final = ErrorCode("cleaning.0005", "Invalid operator input/output types", 400) + CLEANING_INVALID_EXECUTOR_TYPE: Final = ErrorCode("cleaning.0006", "Invalid executor type", 400) + CLEANING_DATASET_NOT_FOUND: Final = ErrorCode("cleaning.0007", "Dataset not found", 404) + CLEANING_FILE_SYSTEM_ERROR: Final = ErrorCode("cleaning.0008", "File system error", 500) + CLEANING_SETTINGS_PARSE_ERROR: Final = ErrorCode("cleaning.0009", "Settings parse error", 400) + CLEANING_TASK_ID_REQUIRED: Final = ErrorCode("cleaning.0010", "Task ID is required", 400) + + # ========== 算子市场模块 ========== + OPERATOR_NOT_FOUND: Final = ErrorCode("operator.0001", "Operator not found", 404) + OPERATOR_IN_INSTANCE: Final = ErrorCode("operator.0002", "Operator is in use", 400) + OPERATOR_CANNOT_DELETE_PREDEFINED: Final = ErrorCode("operator.0003", "Cannot delete predefined operator", 400) + OPERATOR_UNSUPPORTED_FILE_TYPE: Final = ErrorCode("operator.0004", "Unsupported file type", 400) + OPERATOR_PARSE_FAILED: Final = ErrorCode("operator.0005", "Failed to parse operator package", 400) + OPERATOR_FIELD_NOT_FOUND: Final = ErrorCode("operator.0006", "Required field is missing", 400) + # ========== 系统模块 ========== SYSTEM_MODEL_NOT_FOUND: Final = ErrorCode("system.0006", "Model configuration not found", 404) SYSTEM_MODEL_HEALTH_CHECK_FAILED: Final = ErrorCode("system.0007", "Model health check failed", 500) diff --git a/runtime/datamate-python/app/core/exception/middleware.py b/runtime/datamate-python/app/core/exception/middleware.py index 82b03ca2..561d130d 100644 --- a/runtime/datamate-python/app/core/exception/middleware.py +++ b/runtime/datamate-python/app/core/exception/middleware.py @@ -69,7 +69,7 @@ async def dispatch(self, request: Request, call_next): except Exception as exc: # 捕获所有未处理的异常 logger.error( - f"Unhandled exception occurred at {request.method} {request.url.path}", + f"Unhandled exception occurred at {request.method} {request.url.path}", exc, exc_info=True ) return self._error_response( diff --git a/runtime/datamate-python/app/module/cleaning/__init__.py b/runtime/datamate-python/app/module/cleaning/__init__.py index 0d35bbc7..da6c0f3a 100644 --- a/runtime/datamate-python/app/module/cleaning/__init__.py +++ b/runtime/datamate-python/app/module/cleaning/__init__.py @@ -11,18 +11,6 @@ UpdateCleaningTemplateRequest, ) -from .exceptions import ( - CleaningException, - CleaningNameDuplicationError, - CleaningTaskNotFoundError, - CleaningTemplateNotFoundError, - InvalidOperatorInputError, - ExecutorTypeError, - DatasetNotFoundError, - FileSystemError, - SettingsParseError, -) - from .repository import ( CleaningTaskRepository, CleaningTemplateRepository, @@ -50,15 +38,6 @@ "CleaningTemplateDto", "CreateCleaningTemplateRequest", "UpdateCleaningTemplateRequest", - "CleaningException", - "CleaningNameDuplicationError", - "CleaningTaskNotFoundError", - "CleaningTemplateNotFoundError", - 
"InvalidOperatorInputError", - "ExecutorTypeError", - "DatasetNotFoundError", - "FileSystemError", - "SettingsParseError", "CleaningTaskRepository", "CleaningTemplateRepository", "CleaningResultRepository", diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py index dc233f77..82cc24af 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -1,6 +1,6 @@ from typing import Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger @@ -64,13 +64,15 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: dataset_service = DatasetManagementService(db) lineage_service = LineageService(db) + task_repo = CleaningTaskRepository(None) + return CleaningTaskService( - task_repo=CleaningTaskRepository(None), + task_repo=task_repo, result_repo=CleaningResultRepository(None), operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, scheduler=scheduler, - validator=CleanTaskValidator(), + validator=CleanTaskValidator(task_repo=task_repo, template_repo=None), dataset_service=dataset_service, lineage_service=lineage_service, ) @@ -122,20 +124,15 @@ async def create_cleaning_task( db: AsyncSession = Depends(get_db), ): """Create cleaning task""" - try: - task_service = _get_task_service(db) + task_service = _get_task_service(db) - task = await task_service.create_task(db, request) - await db.commit() + task = await task_service.create_task(db, request) + await db.commit() - await task_service.execute_task(db, task.id) - await db.commit() + await task_service.execute_task(db, task.id) + await db.commit() - return StandardResponse(code="0", message="success", data=task) - except Exception as e: - await db.rollback() - logger.error(f"Failed to create cleaning task: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + return StandardResponse(code="0", message="success", data=task) @router.get( @@ -149,13 +146,9 @@ async def get_cleaning_task( db: AsyncSession = Depends(get_db), ): """Get cleaning task by ID""" - try: - task_service = _get_task_service(db) - task = await task_service.get_task(db, task_id) - return StandardResponse(code="0", message="success", data=task) - except Exception as e: - logger.error(f"Failed to get cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=404, detail=str(e)) + task_service = _get_task_service(db) + task = await task_service.get_task(db, task_id) + return StandardResponse(code="0", message="success", data=task) @router.delete( @@ -169,15 +162,10 @@ async def delete_cleaning_task( db: AsyncSession = Depends(get_db), ): """Delete cleaning task""" - try: - task_service = _get_task_service(db) - await task_service.delete_task(db, task_id) - await db.commit() - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to delete cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.delete_task(db, task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) @router.post( @@ -191,13 +179,9 @@ async def 
stop_cleaning_task( db: AsyncSession = Depends(get_db), ): """Stop cleaning task""" - try: - task_service = _get_task_service(db) - await task_service.stop_task(db, task_id) - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - logger.error(f"Failed to stop cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.stop_task(db, task_id) + return StandardResponse(code="0", message="success", data=task_id) @router.post( @@ -211,15 +195,10 @@ async def execute_cleaning_task( db: AsyncSession = Depends(get_db), ): """Execute cleaning task""" - try: - task_service = _get_task_service(db) - await task_service.execute_task(db, task_id) - await db.commit() - return StandardResponse(code="0", message="success", data=task_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to execute cleaning task {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + await task_service.execute_task(db, task_id) + await db.commit() + return StandardResponse(code="0", message="success", data=task_id) @router.get( @@ -233,13 +212,9 @@ async def get_cleaning_task_results( db: AsyncSession = Depends(get_db), ): """Get cleaning task results""" - try: - task_service = _get_task_service(db) - results = await task_service.get_task_results(db, task_id) - return StandardResponse(code="0", message="success", data=results) - except Exception as e: - logger.error(f"Failed to get task results {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + results = await task_service.get_task_results(db, task_id) + return StandardResponse(code="0", message="success", data=results) @router.get( @@ -254,10 +229,6 @@ async def get_cleaning_task_log( db: AsyncSession = Depends(get_db), ): """Get cleaning task log""" - try: - task_service = _get_task_service(db) - logs = await task_service.get_task_log(db, task_id, retry_count) - return StandardResponse(code="0", message="success", data=logs) - except Exception as e: - logger.error(f"Failed to get task log {task_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + task_service = _get_task_service(db) + logs = await task_service.get_task_log(db, task_id, retry_count) + return StandardResponse(code="0", message="success", data=logs) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py index 85abbb25..9c641e62 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -1,7 +1,9 @@ import math from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, Query + +from fastapi import APIRouter, Depends, Query from sqlalchemy import select, func +from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger from app.db.session import get_db @@ -9,11 +11,9 @@ CleaningTemplateDto, CreateCleaningTemplateRequest, UpdateCleaningTemplateRequest, - OperatorInstanceDto, ) from app.module.cleaning.service import CleaningTemplateService from app.module.shared.schema import StandardResponse, PaginatedData -from sqlalchemy.ext.asyncio import AsyncSession logger = 
get_logger(__name__) @@ -48,15 +48,16 @@ def _get_template_service(db: AsyncSession) -> CleaningTemplateService: CleaningTemplateRepository, OperatorInstanceRepository, ) - from app.db.models.cleaning import CleaningTemplate, OperatorInstance operator_service = _get_operator_service() + template_repo = CleaningTemplateRepository(None) + return CleaningTemplateService( - template_repo=CleaningTemplateRepository(None), + template_repo=template_repo, operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, - validator=CleanTaskValidator(), + validator=CleanTaskValidator(task_repo=None, template_repo=template_repo), ) @@ -73,39 +74,36 @@ async def get_cleaning_templates( db: AsyncSession = Depends(get_db), ): """Query cleaning templates with pagination""" - try: - from app.db.models.cleaning import CleaningTemplate - - template_service = _get_template_service(db) - - query = select(CleaningTemplate) - - if keyword: - keyword_pattern = f"%{keyword}%" - query = query.where( - CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern) - ) - - count_query = select(func.count()).select_from(query.subquery()) - total = (await db.execute(count_query)).scalar_one() - items = await template_service.get_templates(db, keyword) - - total_pages = math.ceil(total / size) if total > 0 else 0 - - return StandardResponse( - code="0", - message="success", - data=PaginatedData( - content=items, - total_elements=total, - total_pages=total_pages, - page=page, - size=size, - ) + from app.db.models.cleaning import CleaningTemplate + + template_service = _get_template_service(db) + + query = select(CleaningTemplate) + + if keyword: + keyword_pattern = f"%{keyword}%" + query = query.where( + CleaningTemplate.name.ilike(keyword_pattern) | CleaningTemplate.description.ilike(keyword_pattern) + ) + + count_query = select(func.count()).select_from(query.subquery()) + total = (await db.execute(count_query)).scalar_one() + + items = await template_service.get_templates(db, keyword) + + total_pages = math.ceil(total / size) if total > 0 else 0 + + return StandardResponse( + code="0", + message="success", + data=PaginatedData( + content=items, + total_elements=total, + total_pages=total_pages, + page=page, + size=size, ) - except Exception as e: - logger.error(f"Failed to get cleaning templates: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + ) @router.post( @@ -119,16 +117,12 @@ async def create_cleaning_template( db: AsyncSession = Depends(get_db), ): """Create cleaning template""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) + + template = await template_service.create_template(db, request) + await db.commit() - template = await template_service.create_template(db, request) - await db.commit() - return StandardResponse(code="0", message="success", data=template) - except Exception as e: - await db.rollback() - logger.error(f"Failed to create cleaning template: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + return StandardResponse(code="0", message="success", data=template) @router.get( @@ -142,14 +136,10 @@ async def get_cleaning_template( db: AsyncSession = Depends(get_db), ): """Get cleaning template by ID""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) - template = await template_service.get_template(db, template_id) - return StandardResponse(code="0", message="success", 
data=template) - except Exception as e: - logger.error(f"Failed to get cleaning template {template_id}: {e}", exc_info=True) - raise HTTPException(status_code=404, detail=str(e)) + template = await template_service.get_template(db, template_id) + return StandardResponse(code="0", message="success", data=template) @router.put( @@ -164,16 +154,12 @@ async def update_cleaning_template( db: AsyncSession = Depends(get_db), ): """Update cleaning template""" - try: - template_service = _get_template_service(db) + template_service = _get_template_service(db) - template = await template_service.update_template(db, template_id, request) - await db.commit() - return StandardResponse(code="0", message="success", data=template) - except Exception as e: - await db.rollback() - logger.error(f"Failed to update cleaning template {template_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + template = await template_service.update_template(db, template_id, request) + await db.commit() + + return StandardResponse(code="0", message="success", data=template) @router.delete( @@ -187,12 +173,8 @@ async def delete_cleaning_template( db: AsyncSession = Depends(get_db), ): """Delete cleaning template""" - try: - template_service = _get_template_service(db) - await template_service.delete_template(db, template_id) - await db.commit() - return StandardResponse(code="0", message="success", data=template_id) - except Exception as e: - await db.rollback() - logger.error(f"Failed to delete cleaning template {template_id}: {e}", exc_info=True) - raise HTTPException(status_code=400, detail=str(e)) + template_service = _get_template_service(db) + await template_service.delete_template(db, template_id) + await db.commit() + + return StandardResponse(code="0", message="success", data=template_id) diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py index 56b1bfd0..7c83d9a2 100644 --- a/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_task_repository.py @@ -1,6 +1,6 @@ from typing import List, Optional from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy import select, delete +from sqlalchemy import select, delete, func from app.db.models.cleaning import CleaningTask from app.module.cleaning.schema import CleaningTaskDto @@ -132,3 +132,9 @@ async def delete_task_by_id(self, db: AsyncSession, task_id: str) -> None: query = delete(self.model).where(self.model.id == task_id) await db.execute(query) await db.flush() + + async def is_name_exist(self, db: AsyncSession, name: str) -> bool: + """Check if task name exists""" + query = select(func.count()).select_from(self.model).where(self.model.name == name) + result = await db.execute(query) + return result.scalar_one() > 0 if result else False diff --git a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py index b2aab16e..aa35ba71 100644 --- a/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py +++ b/runtime/datamate-python/app/module/cleaning/repository/cleaning_template_repository.py @@ -1,6 +1,6 @@ from typing import List, Optional from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy import select, delete +from sqlalchemy import select, 
delete, func from app.db.models.cleaning import CleaningTemplate @@ -55,3 +55,9 @@ async def delete_template(self, db: AsyncSession, template_id: str) -> None: query = delete(self.model).where(self.model.id == template_id) await db.execute(query) await db.flush() + + async def is_name_exist(self, db: AsyncSession, name: str) -> bool: + """Check if template name exists""" + query = select(func.count()).select_from(self.model).where(self.model.name == name) + result = await db.execute(query) + return result.scalar_one() > 0 if result else False diff --git a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py index 4ea94464..0c8de701 100644 --- a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py +++ b/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py @@ -1,11 +1,30 @@ -import re +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.exception import BusinessError, ErrorCodes from app.module.cleaning.schema import OperatorInstanceDto -from app.module.cleaning.exceptions import InvalidOperatorInputError, ExecutorTypeError class CleanTaskValidator: """Validator for cleaning tasks and templates""" + def __init__(self, task_repo=None, template_repo=None): + self.task_repo = task_repo + self.template_repo = template_repo + + async def check_task_name_duplication(self, db: AsyncSession, name: str) -> None: + """Check if task name is duplicated""" + if not name: + raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED) + if await self.task_repo.is_name_exist(db, name): + raise BusinessError(ErrorCodes.CLEANING_NAME_DUPLICATED) + + async def check_template_name_duplication(self, db: AsyncSession, name: str) -> None: + """Check if template name is duplicated""" + if not name: + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED) + if await self.template_repo.is_name_exist(db, name): + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NAME_DUPLICATED) + @staticmethod def check_input_and_output(instances: list[OperatorInstanceDto]) -> None: """Validate that operator input/output types are compatible""" @@ -17,16 +36,23 @@ def check_input_and_output(instances: list[OperatorInstanceDto]) -> None: next_op = instances[i + 1] if not current.outputs: - raise InvalidOperatorInputError(f"Operator {current.id} has no outputs defined") + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, + f"Operator {current.id} has no outputs defined" + ) if not next_op.inputs: - raise InvalidOperatorInputError(f"Operator {next_op.id} has no inputs defined") + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, + f"Operator {next_op.id} has no inputs defined" + ) current_outputs = set(current.outputs.split(',')) next_inputs = set(next_op.inputs.split(',')) if not current_outputs.intersection(next_inputs): - raise InvalidOperatorInputError( + raise BusinessError( + ErrorCodes.CLEANING_INVALID_OPERATOR_INPUT, f"Operator {current.id} outputs {current.outputs} " f"but operator {next_op.id} requires {next_op.inputs}" ) @@ -48,7 +74,8 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: executor_types.add("datamate") if len(executor_types) > 1: - raise ExecutorTypeError( + raise BusinessError( + ErrorCodes.CLEANING_INVALID_EXECUTOR_TYPE, "Cannot mix DataMate and DataJuicer operators in same task" ) @@ -58,4 +85,4 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: def 
check_task_id(task_id: str) -> None: """Validate task ID""" if not task_id: - raise ValueError("Task ID cannot be empty") + raise BusinessError(ErrorCodes.CLEANING_TASK_ID_REQUIRED) diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py index 12ab0c6a..5e25ed24 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -1,16 +1,21 @@ import json -import os -import uuid import re import shutil +import uuid from pathlib import Path from typing import List, Dict, Any, Set -from datetime import datetime -from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncSession from app.core.logging import get_logger +from app.db.models.base_entity import LineageNode, LineageEdge +from app.core.exception import BusinessError, ErrorCodes +from app.module.cleaning.repository import ( + CleaningTaskRepository, + CleaningResultRepository, + OperatorInstanceRepository, +) from app.module.cleaning.schema import ( CleaningTaskDto, CreateCleaningTaskRequest, @@ -20,20 +25,10 @@ CleaningProcess, CleaningTaskStatus, ) -from app.module.cleaning.repository import ( - CleaningTaskRepository, - CleaningResultRepository, - OperatorInstanceRepository, -) -from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler from app.module.cleaning.service.clean_task_validator import CleanTaskValidator -from app.module.cleaning.exceptions import ( - CleaningTaskNotFoundError, - FileSystemError, -) -from app.module.shared.schema.lineage import NodeType, EdgeType -from app.db.models.base_entity import LineageNode, LineageEdge +from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler from app.module.shared.common.lineage import LineageService +from app.module.shared.schema.lineage import NodeType, EdgeType logger = get_logger(__name__) @@ -99,7 +94,7 @@ async def get_task(self, db: AsyncSession, task_id: str) -> CleaningTaskDto: """Get task by ID""" task = await self.task_repo.find_task_by_id(db, task_id) if not task: - raise CleaningTaskNotFoundError(task_id) + raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id) await self._set_process(db, task) @@ -138,6 +133,7 @@ async def create_task( instances = await self.get_instance_by_template_id(db, request.template_id) request.instance = instances + await self.validator.check_task_name_duplication(db, request.name) self.validator.check_input_and_output(request.instance) executor_type = self.validator.check_and_get_executor_type(request.instance) @@ -162,7 +158,7 @@ async def create_task( src_dataset = await self.dataset_service.get_dataset(request.src_dataset_id) if not src_dataset: - raise Exception(f"Source dataset not found: {request.src_dataset_id}") + raise BusinessError(ErrorCodes.CLEANING_DATASET_NOT_FOUND, request.src_dataset_id) task_dto = CleaningTaskDto( id=task_id, @@ -265,7 +261,7 @@ async def prepare_task( yaml.dump(process_config, f, default_flow_style=False, allow_unicode=True) except Exception as e: logger.error(f"Failed to write process.yaml: {e}") - raise FileSystemError(f"Failed to write process.yaml: {e}") + raise BusinessError(ErrorCodes.CLEANING_FILE_SYSTEM_ERROR, str(e)) def _get_default_values(self, operator) -> Dict[str, Any]: """Get default values from operator settings""" @@ -403,7 +399,7 @@ async def 
execute_task(self, db: AsyncSession, task_id: str) -> bool: task = await self.task_repo.find_task_by_id(db, task_id) if not task: - raise CleaningTaskNotFoundError(task_id) + raise BusinessError(ErrorCodes.CLEANING_TASK_NOT_FOUND, task_id) await self.scan_dataset(db, task_id, task.src_dataset_id, succeed_set) await self.result_repo.delete_by_instance_id(db, task_id, "FAILED") diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py index 8087a36e..eea48fb5 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py @@ -4,16 +4,18 @@ from sqlalchemy.ext.asyncio import AsyncSession +from app.core.exception import BusinessError, ErrorCodes from app.core.logging import get_logger +from app.module.cleaning import UpdateCleaningTemplateRequest +from app.module.cleaning.repository import ( + CleaningTemplateRepository, + OperatorInstanceRepository, +) from app.module.cleaning.schema import ( CleaningTemplateDto, CreateCleaningTemplateRequest, OperatorInstanceDto, ) -from app.module.cleaning.repository import ( - CleaningTemplateRepository, - OperatorInstanceRepository, -) from app.module.cleaning.service.clean_task_validator import CleanTaskValidator logger = get_logger(__name__) @@ -97,7 +99,7 @@ async def get_template( """Get template by ID""" template = await self.template_repo.find_template_by_id(db, template_id) if not template: - raise ValueError(f"Template {template_id} not found") + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id) template_dto = CleaningTemplateDto( id=template.id, @@ -144,6 +146,7 @@ async def create_template( """Create new template""" from app.db.models.cleaning import CleaningTemplate + await self.validator.check_template_name_duplication(db, request.name) self.validator.check_input_and_output(request.instance) self.validator.check_and_get_executor_type(request.instance) @@ -164,14 +167,13 @@ async def update_template( self, db: AsyncSession, template_id: str, - request: CreateCleaningTemplateRequest + request: UpdateCleaningTemplateRequest ) -> CleaningTemplateDto: """Update template""" - from app.db.models.cleaning import CleaningTemplate template = await self.template_repo.find_template_by_id(db, template_id) if not template: - raise ValueError(f"Template {template_id} not found") + raise BusinessError(ErrorCodes.CLEANING_TEMPLATE_NOT_FOUND, template_id) template.name = request.name template.description = request.description diff --git a/runtime/datamate-python/app/module/operator/interface/operator_routes.py b/runtime/datamate-python/app/module/operator/interface/operator_routes.py index b5eb0a97..4ae78f3a 100644 --- a/runtime/datamate-python/app/module/operator/interface/operator_routes.py +++ b/runtime/datamate-python/app/module/operator/interface/operator_routes.py @@ -4,7 +4,7 @@ """ from typing import Optional -from fastapi import APIRouter, Depends, HTTPException, UploadFile, Form, File, Body +from fastapi import APIRouter, Depends, UploadFile, Form, File, Body from fastapi.responses import FileResponse from app.core.logging import get_logger @@ -30,6 +30,7 @@ router = APIRouter(prefix="/operators", tags=["Operator"]) + def get_operator_service() -> OperatorService: """获取算子服务实例""" return OperatorService( @@ -51,7 +52,7 @@ def get_operator_service() -> OperatorService: async def list_operators( 
request: OperatorListRequest, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """查询算子列表""" operators = await service.get_operators( @@ -70,7 +71,7 @@ async def list_operators( db=db ) - total_pages = (count + request.size - 1) // request.size # Ceiling division + total_pages = (count + request.size - 1) // request.size return StandardResponse( code="0", @@ -94,15 +95,12 @@ async def list_operators( async def get_operator( operator_id: str, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """获取算子详情""" - try: - operator = await service.get_operator_by_id(operator_id, db) - operator.file_name = None # Don't return file_name - return StandardResponse(code="0", message="success", data=operator) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) + operator = await service.get_operator_by_id(operator_id, db) + operator.file_name = None + return StandardResponse(code="0", message="success", data=operator) @router.put( @@ -115,17 +113,12 @@ async def update_operator( operator_id: str, request: OperatorUpdateDto, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """更新算子""" - try: - operator = await service.update_operator(operator_id, request, db) - await db.commit() - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - logger.error(f"{operator_id} {request}", e) - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + operator = await service.update_operator(operator_id, request, db) + await db.commit() + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -137,16 +130,12 @@ async def update_operator( async def create_operator( request: OperatorDto, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db) ): """创建算子""" - try: - operator = await service.create_operator(request, db) - await db.commit() - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + operator = await service.create_operator(request, db) + await db.commit() + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -158,18 +147,15 @@ async def create_operator( async def upload_operator( request: dict = Body(...), service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """上传算子""" - try: - file_name = request.get("fileName") - if not file_name: - raise HTTPException(status_code=422, detail="fileName is required") - operator = await service.upload_operator(file_name, db) - return StandardResponse(code="0", message="success", data=operator) - except Exception as e: - logger.error(f"{file_name}", e) - raise HTTPException(status_code=400, detail=str(e)) + file_name = request.get("fileName") + if not file_name: + from fastapi import HTTPException + raise HTTPException(status_code=422, detail="fileName is required") + operator = await service.upload_operator(file_name, db) + return StandardResponse(code="0", message="success", data=operator) @router.post( @@ -180,20 +166,16 @@ async def upload_operator( ) async def pre_upload( service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """预上传""" - try: - req_id = await service.pre_upload(db) - await db.commit() - 
return StandardResponse( - code="0", - message="success", - data=req_id, - ) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + req_id = await service.pre_upload(db) + await db.commit() + return StandardResponse( + code="0", + message="success", + data=req_id, + ) @router.post( @@ -211,26 +193,22 @@ async def chunk_upload( file: UploadFile = File(...), check_sum_hex: Optional[str] = Form(None, alias="checkSumHex", description="校验和"), service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """分块上传""" - try: - file_content = await file.read() - result = await service.chunk_upload( - req_id=req_id, - file_no=file_no, - file_name=file_name, - total_chunk_num=total_chunk_num, - chunk_no=chunk_no, - check_sum_hex=check_sum_hex, - file_content=file_content, - db=db - ) - await db.commit() - return StandardResponse(code="0", message="success", data=result.dict()) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + file_content = await file.read() + result = await service.chunk_upload( + req_id=req_id, + file_no=file_no, + file_name=file_name, + total_chunk_num=total_chunk_num, + chunk_no=chunk_no, + check_sum_hex=check_sum_hex, + file_content=file_content, + db=db + ) + await db.commit() + return StandardResponse(code="0", message="success", data=result.dict()) @router.delete( @@ -242,16 +220,12 @@ async def chunk_upload( async def delete_operator( operator_id: str, service: OperatorService = Depends(get_operator_service), - db=Depends(get_db) + db = Depends(get_db), ): """删除算子""" - try: - await service.delete_operator(operator_id, db) - await db.commit() - return StandardResponse(code="0", message="success", data=None) - except Exception as e: - await db.rollback() - raise HTTPException(status_code=400, detail=str(e)) + await service.delete_operator(operator_id, db) + await db.commit() + return StandardResponse(code="0", message="success", data=None) @router.get( @@ -261,17 +235,15 @@ async def delete_operator( description="下载示例算子文件" ) async def download_example_operator( - service: OperatorService = Depends(get_operator_service) + service: OperatorService = Depends(get_operator_service), ): """下载示例算子""" from app.module.operator.constants import EXAMPLE_OPERATOR_PATH + example_path = EXAMPLE_OPERATOR_PATH - try: - file_path = service.download_example_operator(example_path) - return FileResponse( - path=str(file_path), - filename=file_path.name, - media_type="application/octet-stream" - ) - except FileNotFoundError: - raise HTTPException(status_code=404, detail="Example file not found") + file_path = service.download_example_operator(example_path) + return FileResponse( + path=str(file_path), + filename=file_path.name, + media_type="application/octet-stream" + ) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index 43594e94..bccab373 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -13,6 +13,7 @@ from sqlalchemy import select, text, func from app.core.logging import get_logger +from app.core.exception import BusinessError, ErrorCodes from app.module.operator.repository import ( OperatorRepository, CategoryRelationRepository, @@ -31,11 +32,6 @@ YAML_PATH, SERVICE_ID, ) -from app.module.operator.exceptions import ( - 
SettingsParseError, - OperatorInInstanceError, - CannotDeletePredefinedOperatorError, -) from app.module.shared.file_service import FileService from app.module.shared.file_models import ( ChunkUploadRequestDto, @@ -254,7 +250,7 @@ async def get_operator_by_id( row = result.fetchone() if not row: - raise ValueError(f"Operator {operator_id} not found") + raise BusinessError(ErrorCodes.OPERATOR_NOT_FOUND, operator_id) # Parse categories from comma-separated string categories_str = row.categories if hasattr(row, 'categories') and row.categories else "" @@ -437,14 +433,14 @@ async def delete_operator( in_template = await self.operator_repo.operator_in_template(operator_id, db) in_unstop_task = await self.operator_repo.operator_in_unstop_task(operator_id, db) if in_template or in_unstop_task: - raise OperatorInInstanceError() + raise BusinessError(ErrorCodes.OPERATOR_IN_INSTANCE) # Check if operator is predefined is_predefined = await self.category_relation_repo.operator_is_predefined( operator_id, db ) if is_predefined: - raise CannotDeletePredefinedOperatorError() + raise BusinessError(ErrorCodes.OPERATOR_CANNOT_DELETE_PREDEFINED) # Get operator for file cleanup operator = await self.get_operator_by_id(operator_id, db) @@ -550,7 +546,7 @@ def _override_settings(self, operator: OperatorDto) -> None: operator.settings = json.dumps(settings) except json.JSONDecodeError as e: - raise SettingsParseError(str(e)) + raise BusinessError(ErrorCodes.OPERATOR_PARSE_FAILED, str(e)) def _convert_to_list_string(self, value: Any) -> str: """转换为逗号分隔的字符串""" From 80df4134dac6ee207410892ccc7d2377d2fbf81c Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 11 Feb 2026 10:44:14 +0800 Subject: [PATCH 15/20] =?UTF-8?q?=E9=80=82=E9=85=8Ddatajuicer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runtime/datamate-python/app/module/cleaning/__init__.py | 4 ++-- .../app/module/cleaning/interface/cleaning_task_routes.py | 4 ++-- .../app/module/cleaning/interface/cleaning_template_routes.py | 4 ++-- .../datamate-python/app/module/cleaning/service/__init__.py | 4 ++-- .../app/module/cleaning/service/cleaning_task_service.py | 4 ++-- .../{clean_task_validator.py => cleaning_task_validator.py} | 4 ++-- .../app/module/cleaning/service/cleaning_template_service.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) rename runtime/datamate-python/app/module/cleaning/service/{clean_task_validator.py => cleaning_task_validator.py} (97%) diff --git a/runtime/datamate-python/app/module/cleaning/__init__.py b/runtime/datamate-python/app/module/cleaning/__init__.py index da6c0f3a..7224d83c 100644 --- a/runtime/datamate-python/app/module/cleaning/__init__.py +++ b/runtime/datamate-python/app/module/cleaning/__init__.py @@ -19,7 +19,7 @@ ) from .service import ( - CleanTaskValidator, + CleaningTaskValidator, CleaningTaskScheduler, CleaningTemplateService, CleaningTaskService, @@ -42,7 +42,7 @@ "CleaningTemplateRepository", "CleaningResultRepository", "OperatorInstanceRepository", - "CleanTaskValidator", + "CleaningTaskValidator", "CleaningTaskScheduler", "CleaningTemplateService", "CleaningTaskService", diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py index 82cc24af..1f8cba2b 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py +++ 
b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py @@ -44,7 +44,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: """Get cleaning task service instance""" from app.module.cleaning.service import ( CleaningTaskScheduler, - CleanTaskValidator, + CleaningTaskValidator, ) from app.module.cleaning.repository import ( CleaningTaskRepository, @@ -72,7 +72,7 @@ def _get_task_service(db: AsyncSession) -> CleaningTaskService: operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, scheduler=scheduler, - validator=CleanTaskValidator(task_repo=task_repo, template_repo=None), + validator=CleaningTaskValidator(task_repo=task_repo, template_repo=None), dataset_service=dataset_service, lineage_service=lineage_service, ) diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py index 9c641e62..102a625e 100644 --- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py +++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_template_routes.py @@ -43,7 +43,7 @@ def _get_operator_service(): def _get_template_service(db: AsyncSession) -> CleaningTemplateService: """Get cleaning template service instance""" - from app.module.cleaning.service import CleanTaskValidator + from app.module.cleaning.service import CleaningTaskValidator from app.module.cleaning.repository import ( CleaningTemplateRepository, OperatorInstanceRepository, @@ -57,7 +57,7 @@ def _get_template_service(db: AsyncSession) -> CleaningTemplateService: template_repo=template_repo, operator_instance_repo=OperatorInstanceRepository(None), operator_service=operator_service, - validator=CleanTaskValidator(task_repo=None, template_repo=template_repo), + validator=CleaningTaskValidator(task_repo=None, template_repo=template_repo), ) diff --git a/runtime/datamate-python/app/module/cleaning/service/__init__.py b/runtime/datamate-python/app/module/cleaning/service/__init__.py index 4a70f7a2..ed305edf 100644 --- a/runtime/datamate-python/app/module/cleaning/service/__init__.py +++ b/runtime/datamate-python/app/module/cleaning/service/__init__.py @@ -1,10 +1,10 @@ -from .clean_task_validator import CleanTaskValidator +from .cleaning_task_validator import CleaningTaskValidator from .cleaning_task_scheduler import CleaningTaskScheduler from .cleaning_template_service import CleaningTemplateService from .cleaning_task_service import CleaningTaskService __all__ = [ - "CleanTaskValidator", + "CleaningTaskValidator", "CleaningTaskScheduler", "CleaningTemplateService", "CleaningTaskService", diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py index 5e25ed24..9886a18d 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_service.py @@ -25,7 +25,7 @@ CleaningProcess, CleaningTaskStatus, ) -from app.module.cleaning.service.clean_task_validator import CleanTaskValidator +from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator from app.module.cleaning.service.cleaning_task_scheduler import CleaningTaskScheduler from app.module.shared.common.lineage import LineageService from app.module.shared.schema.lineage import NodeType, EdgeType @@ -46,7 +46,7 @@ def __init__( 
operator_instance_repo: OperatorInstanceRepository, operator_service, scheduler: CleaningTaskScheduler, - validator: CleanTaskValidator, + validator: CleaningTaskValidator, dataset_service, lineage_service: LineageService, ): diff --git a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py similarity index 97% rename from runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py rename to runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py index 0c8de701..3d02c884 100644 --- a/runtime/datamate-python/app/module/cleaning/service/clean_task_validator.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py @@ -4,7 +4,7 @@ from app.module.cleaning.schema import OperatorInstanceDto -class CleanTaskValidator: +class CleaningTaskValidator: """Validator for cleaning tasks and templates""" def __init__(self, task_repo=None, template_repo=None): @@ -69,7 +69,7 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: if instance.categories: for category in instance.categories: if "datajuicer" in category.lower(): - executor_types.add("datajuicer") + executor_types.add("default") elif "datamate" in category.lower(): executor_types.add("datamate") diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py index eea48fb5..2443bf4f 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_template_service.py @@ -16,7 +16,7 @@ CreateCleaningTemplateRequest, OperatorInstanceDto, ) -from app.module.cleaning.service.clean_task_validator import CleanTaskValidator +from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator logger = get_logger(__name__) @@ -29,7 +29,7 @@ def __init__( template_repo: CleaningTemplateRepository, operator_instance_repo: OperatorInstanceRepository, operator_service, - validator: CleanTaskValidator, + validator: CleaningTaskValidator, ): self.template_repo = template_repo self.operator_instance_repo = operator_instance_repo From ae8a877fbc842474dd9caabeb8b5fb59d90b0802 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 11 Feb 2026 11:16:30 +0800 Subject: [PATCH 16/20] =?UTF-8?q?=E9=80=82=E9=85=8Ddatajuicer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/pages/DataCleansing/Detail/components/LogsTable.tsx | 2 +- .../src/pages/DataCleansing/Home/components/TemplateList.tsx | 2 +- .../app/module/cleaning/service/cleaning_task_validator.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/frontend/src/pages/DataCleansing/Detail/components/LogsTable.tsx b/frontend/src/pages/DataCleansing/Detail/components/LogsTable.tsx index 280bdec5..48dd13d0 100644 --- a/frontend/src/pages/DataCleansing/Detail/components/LogsTable.tsx +++ b/frontend/src/pages/DataCleansing/Detail/components/LogsTable.tsx @@ -30,7 +30,7 @@ export default function LogsTable({taskLog, fetchTaskLog, retryCount} : {taskLog ))} - {t("dataCleansing.detail.logTable.nthRun", { selectedLog: selectedLog })} + {t("dataCleansing.detail.logTable.nthRun", { num: selectedLog })} diff --git a/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx 
b/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx index b8d8ada1..f967b50e 100644 --- a/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx +++ b/frontend/src/pages/DataCleansing/Home/components/TemplateList.tsx @@ -39,7 +39,7 @@ export default function TemplateList() { }, { key: "delete", - label: t("dataCleansing.actions.deleteTemplate"), + label: t("dataCleansing.actions.delete"), danger: true, icon:, onClick: deleteTemplate, // implement delete logic diff --git a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py index 3d02c884..32cc47db 100644 --- a/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py +++ b/runtime/datamate-python/app/module/cleaning/service/cleaning_task_validator.py @@ -2,6 +2,7 @@ from app.core.exception import BusinessError, ErrorCodes from app.module.cleaning.schema import OperatorInstanceDto +from app.module.operator.constants import CATEGORY_DATA_JUICER_ID, CATEGORY_DATAMATE_ID class CleaningTaskValidator: @@ -68,9 +69,9 @@ def check_and_get_executor_type(instances: list[OperatorInstanceDto]) -> str: for instance in instances: if instance.categories: for category in instance.categories: - if "datajuicer" in category.lower(): + if CATEGORY_DATA_JUICER_ID in category.lower(): executor_types.add("default") - elif "datamate" in category.lower(): + elif CATEGORY_DATAMATE_ID in category.lower(): executor_types.add("datamate") if len(executor_types) > 1: From 56de047ece8f2391dd2c4d0496194f0d0542cc7a Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 11 Feb 2026 11:49:57 +0800 Subject: [PATCH 17/20] =?UTF-8?q?=E9=80=82=E9=85=8Ddatajuicer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/module/operator/service/operator_service.py | 7 +++---- runtime/ops/pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/runtime/datamate-python/app/module/operator/service/operator_service.py b/runtime/datamate-python/app/module/operator/service/operator_service.py index bccab373..6314f221 100644 --- a/runtime/datamate-python/app/module/operator/service/operator_service.py +++ b/runtime/datamate-python/app/module/operator/service/operator_service.py @@ -6,6 +6,7 @@ import os import uuid import shutil +from datetime import datetime from pathlib import Path from typing import List, Optional, Dict, Any, TYPE_CHECKING @@ -306,7 +307,6 @@ async def create_operator( db: AsyncSession ) -> OperatorDto: """创建算子""" - from datetime import datetime, timezone # Generate ID if not provided if not req.id: @@ -330,7 +330,7 @@ async def create_operator( release = req.releases[0] release.id = req.id release.version = req.version - release.release_date = datetime.utcnow() + release.release_date = datetime.now() await self.operator_release_repo.insert(release, db) # Extract files @@ -350,7 +350,6 @@ async def update_operator( db: AsyncSession ) -> OperatorDto: """更新算子""" - from datetime import datetime, timezone # Get existing operator existing = await self.get_operator_by_id(operator_id, db) @@ -406,7 +405,7 @@ async def update_operator( release = req.releases[0] release.id = operator_id release.version = req.version - release.release_date = datetime.utcnow() + release.release_date = datetime.now() if original_version == release.version: await self.operator_release_repo.update(release, db) else: diff --git a/runtime/ops/pyproject.toml 
b/runtime/ops/pyproject.toml index dd8271d1..11d6bb11 100644 --- a/runtime/ops/pyproject.toml +++ b/runtime/ops/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "openslide-python>=1.4.3", "paddleocr==3.3.0", "paddlepaddle==3.2.2", - "pandas>=2.2.3", + "pandas>=2.2.3,<3.0.0", "presidio-analyzer==2.2.25", "presidio-anonymizer==2.2.25", "pycryptodome>=3.23.0", From e9955fb2506227b68c97ea0f627b1270d2b690cd Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 11 Feb 2026 14:54:17 +0800 Subject: [PATCH 18/20] =?UTF-8?q?=E9=80=82=E9=85=8Ddatajuicer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datamate/wrappers/data_juicer_executor.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py index d1a57125..975763d0 100644 --- a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py +++ b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py @@ -14,6 +14,7 @@ from datamate.core.base_op import FileExporter, SUCCESS_STATUS from datamate.core.constant import Fields from datamate.wrappers.executor import RayExecutor +from datamate.sql_manager.persistence_atction import TaskInfoPersistence DJ_OUTPUT = "outputs" @@ -103,6 +104,10 @@ def run(self): logger.info('Read data...') dataset = dataset.map(FileExporter().read_file, num_cpus=0.05) + # 保存原始数据文件ID集合,用于后续过滤数据检测 + original_file_ids = set(dataset.unique("fileId")) + + # 写入数据集文件 with open(self.dataset_path, "w", encoding="utf-8") as f: for batch_df in dataset.iter_batches(batch_format="pandas", batch_size=2048): batch_df.to_json(f, orient="records", lines=True, force_ascii=False) @@ -118,6 +123,21 @@ def run(self): processed_dataset = processed_dataset.map(FileExporter().save_file_and_db, num_cpus=0.05) for _ in processed_dataset.iter_batches(): pass + + # 特殊处理:识别被过滤的数据 + processed_file_ids = set(processed_dataset.unique("fileId")) + filtered_file_ids = original_file_ids - processed_file_ids + + if filtered_file_ids: + logger.info(f"Found {len(filtered_file_ids)} filtered files, updating task result only") + for sample_dict in dataset.iter_batches(batch_format="pandas", batch_size=2048): + for _, row in sample_dict.iterrows(): + if str(row.get("fileId", "")) in filtered_file_ids: + sample_dict["fileSize"] = "0" + sample_dict["fileType"] = "" + TaskInfoPersistence().update_task_result(sample_dict) + + self.scan_files() except Exception as e: logger.error(f"An unexpected error occurred.", e) raise e From 814e6934e895c6d01e18ac9a878a7b6b4680ac2a Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Wed, 11 Feb 2026 15:18:51 +0800 Subject: [PATCH 19/20] =?UTF-8?q?=E9=80=82=E9=85=8Ddatajuicer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../datamate/wrappers/data_juicer_executor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py index 975763d0..6d345f4b 100644 --- a/runtime/python-executor/datamate/wrappers/data_juicer_executor.py +++ b/runtime/python-executor/datamate/wrappers/data_juicer_executor.py @@ -125,7 +125,10 @@ def run(self): pass # 特殊处理:识别被过滤的数据 - processed_file_ids = set(processed_dataset.unique("fileId")) + if processed_dataset.count() == 0: + processed_file_ids = set() + else: + 
processed_file_ids = set(processed_dataset.unique("fileId")) filtered_file_ids = original_file_ids - processed_file_ids if filtered_file_ids: @@ -133,9 +136,11 @@ def run(self): for sample_dict in dataset.iter_batches(batch_format="pandas", batch_size=2048): for _, row in sample_dict.iterrows(): if str(row.get("fileId", "")) in filtered_file_ids: - sample_dict["fileSize"] = "0" - sample_dict["fileType"] = "" - TaskInfoPersistence().update_task_result(sample_dict) + row["fileSize"] = "0" + row["fileType"] = "" + row["execute_status"] = SUCCESS_STATUS + row[Fields.instance_id] = self.cfg.instance_id + TaskInfoPersistence().update_task_result(row) self.scan_files() except Exception as e: From 2cc6591c796aa47c992e9772e4763da4c0e0ab67 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Thu, 12 Feb 2026 09:35:46 +0800 Subject: [PATCH 20/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E8=BF=81=E7=A7=BB?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gateway/ApiGatewayApplication.java | 5 +- backend/openapi/README.md | 2 - .../services/data-cleaning-service/pom.xml | 89 ---- .../DataCleaningServiceConfiguration.java | 18 - .../application/CleaningTaskService.java | 417 ------------------ .../application/CleaningTemplateService.java | 117 ----- .../scheduler/CleaningTaskScheduler.java | 44 -- .../common/enums/CleaningTaskStatusEnum.java | 39 -- .../cleaning/common/enums/ExecutorType.java | 25 -- .../common/exception/CleanErrorCode.java | 23 - .../cleaning/domain/model/TaskProcess.java | 24 - .../domain/model/entity/CleaningResult.java | 32 -- .../domain/model/entity/CleaningTask.java | 45 -- .../domain/model/entity/CleaningTemplate.java | 26 -- .../domain/model/entity/OperatorInstance.java | 18 - .../model/entity/TemplateWithInstance.java | 27 -- .../repository/CleaningResultRepository.java | 20 - .../repository/CleaningTaskRepository.java | 21 - .../CleaningTemplateRepository.java | 20 - .../OperatorInstanceRepository.java | 18 - .../converter/CleaningResultConverter.java | 15 - .../converter/CleaningTaskConverter.java | 19 - .../converter/CleaningTemplateConverter.java | 15 - .../converter/OperatorInstanceConverter.java | 72 --- .../httpclient/RuntimeClient.java | 68 --- .../Impl/CleaningResultRepositoryImpl.java | 58 --- .../Impl/CleaningTaskRepositoryImpl.java | 65 --- .../Impl/CleaningTemplateRepositoryImpl.java | 56 --- .../Impl/OperatorInstanceRepositoryImpl.java | 53 --- .../mapper/CleaningResultMapper.java | 11 - .../mapper/CleaningTaskMapper.java | 9 - .../mapper/CleaningTemplateMapper.java | 22 - .../mapper/OperatorInstanceMapper.java | 26 -- .../validator/CleanTaskValidator.java | 84 ---- .../interfaces/dto/CleaningProcess.java | 52 --- .../interfaces/dto/CleaningResultDto.java | 30 -- .../interfaces/dto/CleaningTaskDto.java | 60 --- .../interfaces/dto/CleaningTaskLog.java | 12 - .../interfaces/dto/CleaningTemplateDto.java | 34 -- .../dto/CreateCleaningTaskRequest.java | 46 -- .../dto/CreateCleaningTemplateRequest.java | 23 - .../interfaces/dto/OperatorInstanceDto.java | 37 -- .../dto/UpdateCleaningTemplateRequest.java | 26 -- .../rest/CleaningTaskController.java | 87 ---- .../rest/CleaningTemplateController.java | 78 ---- backend/services/main-application/pom.xml | 10 - .../services/operator-market-service/pom.xml | 81 ---- .../OperatorMarketServiceConfiguration.java | 23 - .../operator/application/CategoryService.java | 68 --- .../operator/application/OperatorService.java | 305 ------------- 
.../domain/contants/OperatorConstant.java | 54 --- .../operator/domain/model/Category.java | 24 - .../domain/model/CategoryRelation.java | 16 - .../operator/domain/model/Operator.java | 38 -- .../domain/model/OperatorRelease.java | 24 - .../operator/domain/model/OperatorView.java | 47 -- .../CategoryRelationRepository.java | 20 - .../domain/repository/CategoryRepository.java | 11 - .../repository/OperatorReleaseRepository.java | 17 - .../domain/repository/OperatorRepository.java | 23 - .../repository/OperatorViewRepository.java | 16 - .../converter/CategoryConverter.java | 15 - .../converter/CategoryRelationConverter.java | 15 - .../converter/OperatorConverter.java | 39 -- .../converter/OperatorReleaseConverter.java | 17 - .../exception/OperatorErrorCode.java | 27 -- .../infrastructure/parser/AbstractParser.java | 89 ---- .../infrastructure/parser/ParserHolder.java | 64 --- .../infrastructure/parser/TarParser.java | 77 ---- .../infrastructure/parser/ZipParser.java | 77 ---- .../Impl/CategoryRelationRepositoryImpl.java | 60 --- .../Impl/CategoryRepositoryImpl.java | 24 - .../Impl/OperatorReleaseRepositoryImpl.java | 46 -- .../Impl/OperatorRepositoryImpl.java | 65 --- .../Impl/OperatorViewRepositoryImpl.java | 86 ---- .../persistence/mapper/CategoryMapper.java | 11 - .../mapper/CategoryRelationMapper.java | 11 - .../persistence/mapper/OperatorMapper.java | 21 - .../mapper/OperatorReleaseMapper.java | 11 - .../mapper/OperatorViewMapper.java | 33 -- .../operator/interfaces/dto/CategoryDto.java | 24 - .../interfaces/dto/CategoryRelationDto.java | 12 - .../dto/CategoryTreePagedResponse.java | 24 - .../interfaces/dto/CategoryTreeResponse.java | 22 - .../operator/interfaces/dto/LabelDto.java | 13 - .../operator/interfaces/dto/OperatorDto.java | 60 --- .../interfaces/dto/OperatorReleaseDto.java | 19 - .../dto/OperatorsListPostRequest.java | 31 -- .../interfaces/dto/UploadOperatorRequest.java | 34 -- .../interfaces/rest/CategoryController.java | 32 -- .../interfaces/rest/OperatorController.java | 92 ---- backend/services/pom.xml | 2 - .../operator/interface/category_routes.py | 24 +- .../repository/category_repository.py | 2 +- .../app/module/operator/schema/category.py | 6 +- .../operator/service/category_service.py | 2 +- 96 files changed, 29 insertions(+), 4023 deletions(-) delete mode 100644 backend/services/data-cleaning-service/pom.xml delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/DataCleaningServiceConfiguration.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTemplateService.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/scheduler/CleaningTaskScheduler.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/enums/CleaningTaskStatusEnum.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/enums/ExecutorType.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/common/exception/CleanErrorCode.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/TaskProcess.java delete mode 100644 
backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningResult.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTask.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/CleaningTemplate.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/OperatorInstance.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/model/entity/TemplateWithInstance.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/repository/CleaningResultRepository.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/repository/CleaningTaskRepository.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/repository/CleaningTemplateRepository.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/domain/repository/OperatorInstanceRepository.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/converter/CleaningResultConverter.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/converter/CleaningTaskConverter.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/converter/CleaningTemplateConverter.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/converter/OperatorInstanceConverter.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/httpclient/RuntimeClient.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/Impl/CleaningResultRepositoryImpl.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/Impl/CleaningTaskRepositoryImpl.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/Impl/CleaningTemplateRepositoryImpl.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/Impl/OperatorInstanceRepositoryImpl.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningResultMapper.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTaskMapper.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/CleaningTemplateMapper.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/persistence/mapper/OperatorInstanceMapper.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CleaningProcess.java delete mode 100644 
backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CleaningResultDto.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CleaningTaskDto.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CleaningTaskLog.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CleaningTemplateDto.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CreateCleaningTaskRequest.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/CreateCleaningTemplateRequest.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/OperatorInstanceDto.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/dto/UpdateCleaningTemplateRequest.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/rest/CleaningTaskController.java delete mode 100644 backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/interfaces/rest/CleaningTemplateController.java delete mode 100644 backend/services/operator-market-service/pom.xml delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/OperatorMarketServiceConfiguration.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/application/CategoryService.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/application/OperatorService.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/contants/OperatorConstant.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Category.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/CategoryRelation.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/Operator.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/OperatorRelease.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/model/OperatorView.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/repository/CategoryRelationRepository.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/repository/CategoryRepository.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/repository/OperatorReleaseRepository.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/repository/OperatorRepository.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/domain/repository/OperatorViewRepository.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/converter/CategoryConverter.java delete mode 100644 
backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/converter/CategoryRelationConverter.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/converter/OperatorConverter.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/converter/OperatorReleaseConverter.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/exception/OperatorErrorCode.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/parser/AbstractParser.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/parser/ParserHolder.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/parser/TarParser.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/parser/ZipParser.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/CategoryRelationRepositoryImpl.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/CategoryRepositoryImpl.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorReleaseRepositoryImpl.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorRepositoryImpl.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/Impl/OperatorViewRepositoryImpl.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryMapper.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/CategoryRelationMapper.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorMapper.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorReleaseMapper.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/infrastructure/persistence/mapper/OperatorViewMapper.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/CategoryDto.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/CategoryRelationDto.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/CategoryTreePagedResponse.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/CategoryTreeResponse.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/LabelDto.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/OperatorDto.java delete mode 100644 
backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/OperatorReleaseDto.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/OperatorsListPostRequest.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/dto/UploadOperatorRequest.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/rest/CategoryController.java delete mode 100644 backend/services/operator-market-service/src/main/java/com/datamate/operator/interfaces/rest/OperatorController.java diff --git a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java index ba163782..687645bc 100644 --- a/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java +++ b/backend/api-gateway/src/main/java/com/datamate/gateway/ApiGatewayApplication.java @@ -46,7 +46,10 @@ public RouteLocator customRouteLocator(RouteLocatorBuilder builder) { .uri("http://datamate-backend-python:18000")) // 数据评估服务路由 - .route("data-operator", r -> r.path("/api/operators/**", "api/categories/**") + .route("data-operator", r -> r.path("/api/operators/**") + .uri("http://datamate-backend-python:18000")) + + .route("data-categories", r -> r.path("/api/categories/**") .uri("http://datamate-backend-python:18000")) .route("data-cleaning", r -> r.path("/api/cleaning/**") diff --git a/backend/openapi/README.md b/backend/openapi/README.md index 18fbe63d..d03f1cc4 100644 --- a/backend/openapi/README.md +++ b/backend/openapi/README.md @@ -126,8 +126,6 @@ OPENAPI_DIR="openapi/specs" SERVICES=( "data-annotation-service" "data-management-service" - "operator-market-service" - "data-cleaning-service" "data-synthesis-service" "data-evaluation-service" "pipeline-orchestration-service" diff --git a/backend/services/data-cleaning-service/pom.xml b/backend/services/data-cleaning-service/pom.xml deleted file mode 100644 index c1fa2c9b..00000000 --- a/backend/services/data-cleaning-service/pom.xml +++ /dev/null @@ -1,89 +0,0 @@ - - - - - diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/DataCleaningServiceConfiguration.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/DataCleaningServiceConfiguration.java deleted file mode 100644 index 1ea5ad09..00000000 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/DataCleaningServiceConfiguration.java +++ /dev/null @@ -1,18 +0,0 @@ -package com.datamate.cleaning; - -import org.springframework.context.annotation.ComponentScan; -import org.springframework.scheduling.annotation.EnableAsync; -import org.springframework.scheduling.annotation.EnableScheduling; - -/** - * 数据归集服务配置类 - * 基于DataX的数据归集和同步服务,支持多种数据源的数据采集和归集 - */ -@EnableAsync -@EnableScheduling -@ComponentScan(basePackages = { - "com.datamate.cleaning" -}) -public class DataCleaningServiceConfiguration { - // Configuration class for JAR packaging - no main method needed -} diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java deleted file mode 100644 index 461e8809..00000000 --- 
a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java +++ /dev/null @@ -1,417 +0,0 @@ -package com.datamate.cleaning.application; - - -import com.datamate.cleaning.application.scheduler.CleaningTaskScheduler; -import com.datamate.cleaning.common.enums.CleaningTaskStatusEnum; -import com.datamate.cleaning.common.enums.ExecutorType; -import com.datamate.cleaning.domain.model.TaskProcess; -import com.datamate.cleaning.domain.repository.CleaningResultRepository; -import com.datamate.cleaning.domain.repository.CleaningTaskRepository; -import com.datamate.cleaning.domain.repository.OperatorInstanceRepository; -import com.datamate.cleaning.infrastructure.validator.CleanTaskValidator; -import com.datamate.cleaning.interfaces.dto.*; -import com.datamate.common.domain.enums.EdgeType; -import com.datamate.common.domain.enums.NodeType; -import com.datamate.common.domain.model.LineageEdge; -import com.datamate.common.domain.model.LineageNode; -import com.datamate.common.domain.service.LineageService; -import com.datamate.common.infrastructure.exception.BusinessException; -import com.datamate.common.infrastructure.exception.SystemErrorCode; -import com.datamate.common.interfaces.PagedResponse; -import com.datamate.common.interfaces.PagingQuery; -import com.datamate.datamanagement.application.DatasetApplicationService; -import com.datamate.datamanagement.application.DatasetFileApplicationService; -import com.datamate.datamanagement.common.enums.DatasetType; -import com.datamate.datamanagement.domain.model.dataset.Dataset; -import com.datamate.datamanagement.domain.model.dataset.DatasetFile; -import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest; -import com.datamate.operator.domain.repository.OperatorRepository; -import com.datamate.operator.infrastructure.exception.OperatorErrorCode; -import com.datamate.operator.interfaces.dto.OperatorDto; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.PropertyNamingStrategies; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.springframework.stereotype.Service; -import org.springframework.transaction.annotation.Transactional; -import org.yaml.snakeyaml.DumperOptions; -import org.yaml.snakeyaml.Yaml; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.*; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -@Slf4j -@Service -@RequiredArgsConstructor -public class CleaningTaskService { - private final CleaningTaskRepository cleaningTaskRepo; - - private final OperatorInstanceRepository operatorInstanceRepo; - - private final OperatorRepository operatorRepo; - - private final CleaningResultRepository cleaningResultRepo; - - private final CleaningTaskScheduler taskScheduler; - - private final DatasetApplicationService datasetService; - - private final 
DatasetFileApplicationService datasetFileService; - - private final CleanTaskValidator cleanTaskValidator; - - private final LineageService lineageService; - - private final String DATASET_PATH = "/dataset"; - - private final String FLOW_PATH = "/flow"; - - private static final Pattern STANDARD_LEVEL_PATTERN = Pattern.compile( - "\\b(DEBUG|Debug|INFO|Info|WARN|Warn|WARNING|Warning|ERROR|Error|FATAL|Fatal)\\b" - ); - - private static final Pattern EXCEPTION_SUFFIX_PATTERN = Pattern.compile( - "\\b\\w+(Warning|Error|Exception)\\b" - ); - - private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public List<CleaningTaskDto> getTasks(String status, String keywords, Integer page, Integer size) { - List<CleaningTaskDto> tasks = cleaningTaskRepo.findTasks(status, keywords, page, size); - tasks.forEach(this::setProcess); - return tasks; - } - - private void setProcess(CleaningTaskDto task) { - int[] count = cleaningResultRepo.countByInstanceId(task.getId()); - task.setProgress(CleaningProcess.of(task.getFileCount(), count[0], count[1])); - } - - public int countTasks(String status, String keywords) { - return cleaningTaskRepo.findTasks(status, keywords, null, null).size(); - } - - @Transactional - public CleaningTaskDto createTask(CreateCleaningTaskRequest request) { - cleanTaskValidator.checkNameDuplication(request.getName()); - cleanTaskValidator.checkInputAndOutput(request.getInstance()); - - ExecutorType executorType = cleanTaskValidator.checkAndGetExecutorType(request.getInstance()); - - Dataset destDataset; - if (StringUtils.isNotBlank(request.getDestDatasetId())) { - destDataset = datasetService.getDataset(request.getDestDatasetId()); - } else { - CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest(); - createDatasetRequest.setName(request.getDestDatasetName()); - createDatasetRequest.setDatasetType(DatasetType.valueOf(request.getDestDatasetType())); - createDatasetRequest.setStatus("ACTIVE"); - destDataset = datasetService.createDataset(createDatasetRequest); - } - Dataset srcDataset = datasetService.getDataset(request.getSrcDatasetId()); - - CleaningTaskDto task = new CleaningTaskDto(); - task.setName(request.getName()); - task.setDescription(request.getDescription()); - task.setStatus(CleaningTaskStatusEnum.PENDING); - String taskId = UUID.randomUUID().toString(); - task.setId(taskId); - task.setSrcDatasetId(request.getSrcDatasetId()); - task.setSrcDatasetName(request.getSrcDatasetName()); - task.setDestDatasetId(destDataset.getId()); - task.setDestDatasetName(destDataset.getName()); - task.setBeforeSize(srcDataset.getSizeBytes()); - task.setFileCount(srcDataset.getFileCount().intValue()); - cleaningTaskRepo.insertTask(task); - // 记录血缘关系 - addCleaningToGraph(srcDataset, task, destDataset); - 
operatorInstanceRepo.insertInstance(taskId, request.getInstance()); - operatorRepo.incrementUsageCount(request.getInstance().stream() - .map(OperatorInstanceDto::getId) - .collect(Collectors.toList())); - - prepareTask(task, request.getInstance(), executorType); - scanDataset(taskId, request.getSrcDatasetId()); - - return task; - } - - private void addCleaningToGraph(Dataset srcDataset, CleaningTaskDto task, Dataset destDataset) { - LineageNode fromNode = new LineageNode(); - fromNode.setId(srcDataset.getId()); - fromNode.setName(srcDataset.getName()); - fromNode.setDescription(srcDataset.getDescription()); - fromNode.setNodeType(NodeType.DATASET); - - LineageNode toNode = new LineageNode(); - toNode.setId(destDataset.getId()); - toNode.setName(destDataset.getName()); - toNode.setDescription(destDataset.getDescription()); - toNode.setNodeType(NodeType.DATASET); - - LineageEdge edge = new LineageEdge(); - edge.setProcessId(task.getId()); - edge.setName(task.getName()); - edge.setDescription(task.getDescription()); - edge.setEdgeType(EdgeType.DATA_CLEANING); - edge.setFromNodeId(fromNode.getId()); - edge.setToNodeId(toNode.getId()); - - lineageService.generateGraph(fromNode, edge, toNode); - } - - public CleaningTaskDto getTask(String taskId) { - CleaningTaskDto task = cleaningTaskRepo.findTaskById(taskId); - setProcess(task); - task.setInstance(operatorInstanceRepo.findOperatorByInstanceId(taskId)); - return task; - } - - public List<CleaningResultDto> getTaskResults(String taskId) { - return cleaningResultRepo.findByInstanceId(taskId); - } - - public List<CleaningTaskLog> getTaskLog(String taskId, int retryCount) { - cleanTaskValidator.checkTaskId(taskId); - String logPath = FLOW_PATH + "/" + taskId + "/output.log"; - if (retryCount > 0) { - logPath += "." + retryCount; - } - try (Stream<String> lines = Files.lines(Paths.get(logPath))) { - List<CleaningTaskLog> logs = new ArrayList<>(); - AtomicReference<String> lastLevel = new AtomicReference<>("INFO"); - lines.forEach(line -> { - lastLevel.set(getLogLevel(line, lastLevel.get())); - CleaningTaskLog log = new CleaningTaskLog(); - log.setLevel(lastLevel.get()); - log.setMessage(line); - logs.add(log); - }); - return logs; - } catch (IOException e) { - log.error("Fail to read log file {}", logPath, e); - return Collections.emptyList(); - } - } - - private String getLogLevel(String logLine, String defaultLevel) { - if (logLine == null || logLine.trim().isEmpty()) { - return defaultLevel; - } - - Matcher stdMatcher = STANDARD_LEVEL_PATTERN.matcher(logLine); - if (stdMatcher.find()) { - return stdMatcher.group(1).toUpperCase(); - } - - Matcher exMatcher = EXCEPTION_SUFFIX_PATTERN.matcher(logLine); - if (exMatcher.find()) { - String match = exMatcher.group(1).toUpperCase(); - if ("WARNING".equals(match)) return "WARN"; - if ("ERROR".equals(match) || "EXCEPTION".equals(match)) return "ERROR"; - } - return defaultLevel; - } - - @Transactional - public void deleteTask(String taskId) { - cleanTaskValidator.checkTaskId(taskId); - cleaningTaskRepo.deleteTaskById(taskId); - operatorInstanceRepo.deleteByInstanceId(taskId); - cleaningResultRepo.deleteByInstanceId(taskId); - try { - FileUtils.deleteDirectory(new File(FLOW_PATH + "/" + taskId)); - } catch (IOException e) { - log.warn("Can't delete flow path with task id: {}.", taskId, e); - } - } - - public void executeTask(String taskId) { - List<CleaningResultDto> succeed = cleaningResultRepo.findByInstanceId(taskId, "COMPLETED"); - Set<String> succeedSet = succeed.stream().map(CleaningResultDto::getSrcFileId).collect(Collectors.toSet()); - CleaningTaskDto task =
cleaningTaskRepo.findTaskById(taskId); - scanDataset(taskId, task.getSrcDatasetId(), succeedSet); - cleaningResultRepo.deleteByInstanceId(taskId, "FAILED"); - taskScheduler.executeTask(taskId, task.getRetryCount() + 1); - } - - private void prepareTask(CleaningTaskDto task, List<OperatorInstanceDto> instances, ExecutorType executorType) { - List<OperatorDto> allOperators = operatorRepo.findAllOperators(); - Map<String, OperatorDto> operatorDtoMap = allOperators.stream() - .collect(Collectors.toMap(OperatorDto::getId, Function.identity())); - - TaskProcess process = new TaskProcess(); - process.setInstanceId(task.getId()); - process.setDatasetId(task.getDestDatasetId()); - process.setExecutorType(executorType.getValue()); - process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl"); - process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId()); - process.setProcess(instances.stream() - .map(instance -> { - OperatorDto operatorDto = operatorDtoMap.get(instance.getId()); - Map<String, Object> stringObjectMap = getDefaultValue(operatorDto); - stringObjectMap.putAll(instance.getOverrides()); - Map<String, Object> runtime = getRuntime(operatorDto); - stringObjectMap.putAll(runtime); - return Map.of(instance.getId(), stringObjectMap); - }) - .toList()); - - ObjectMapper jsonMapper = new ObjectMapper(new YAMLFactory()); - jsonMapper.setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE); - JsonNode jsonNode = jsonMapper.valueToTree(process); - - DumperOptions options = new DumperOptions(); - options.setIndent(2); - options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK); - Yaml yaml = new Yaml(options); - - File file = new File(FLOW_PATH + "/" + task.getId() + "/process.yaml"); - file.getParentFile().mkdirs(); - - try (FileWriter writer = new FileWriter(file)) { - yaml.dump(jsonMapper.treeToValue(jsonNode, Map.class), writer); - } catch (IOException e) { - log.error("Failed to prepare process.yaml.", e); - throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR); - } - } - - private Map<String, Object> getDefaultValue(OperatorDto operatorDto) { - if (StringUtils.isBlank(operatorDto.getSettings())) { - return new HashMap<>(); - } - - Map<String, Object> defaultSettings = new HashMap<>(); - try { - Map<String, Map<String, Object>> settings = OBJECT_MAPPER.readValue(operatorDto.getSettings(), Map.class); - for (Map.Entry<String, Map<String, Object>> entry : settings.entrySet()) { - String key = entry.getKey(); - Map<String, Object> setting = entry.getValue(); - String type = setting.get("type").toString(); - switch (type) { - case "slider": - case "switch": - case "select": - case "input": - case "radio": - case "checkbox": - if (setting.containsKey("defaultVal")) { - defaultSettings.put(key, setting.get("defaultVal")); - } - break; - case "range": - List