From 46f347b942a52818397a4340cd20946bcfa579ae Mon Sep 17 00:00:00 2001
From: hhhhsc <1710496817@qq.com>
Date: Fri, 23 Jan 2026 17:42:44 +0800
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E9=87=8D=E6=96=B0=E7=BB=84?=
 =?UTF-8?q?=E7=BB=87=E6=B8=85=E6=B4=97=E6=A8=A1=E6=9D=BF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (free-form text placed between the three-dash line and the
first "diff --git" line; it is commentary, not part of the commit):

* The subject decodes to "feat: 重新组织清洗模板" (reorganise the cleaning
  templates).
* The two generic templates are replaced by five scenario-specific ones.
  This hunk contains no DELETE and every INSERT uses ON CONFLICT ... DO
  NOTHING, so a database seeded by the previous script presumably keeps the
  old '26ae585c-...' and '4421504e-...' rows. TODO: confirm whether a
  cleanup statement is needed.
* The description of template 994c2833-... advertises object-detection
  pre-annotation, but no such operator is inserted for that template below.
  TODO: add the operator or adjust the description.
* settings_override values in this patch use the
  {"<key>": {"defaultVal": ...}} wrapper; patch 2/2 flattens them to plain
  {"<key>": value} objects.

 scripts/db/data-cleaning-init.sql | 106 ++++++++++++++++--------------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/scripts/db/data-cleaning-init.sql b/scripts/db/data-cleaning-init.sql
index 93322f44..f3f370b9 100644
--- a/scripts/db/data-cleaning-init.sql
+++ b/scripts/db/data-cleaning-init.sql
@@ -128,56 +128,66 @@ CREATE TRIGGER update_clean_template_updated_at
 -- 插入初始数据 - 清洗模板
 INSERT INTO t_clean_template (id, name, description)
 VALUES
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清洗模板'),
-    ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板')
-    ON CONFLICT (id) DO NOTHING;
+    ('550e8400-e29b-41d4-a716-446655440001', '安全与隐私合规处理模板', '针对敏感数据进行严格清洗,移除PII(个人身份信息)、政治敏感、暴力色情内容,适用于模型对外发布前的安全合规检查。'),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'LLM SFT高质量文本清洗模板', '旨在生成高质量、低噪声的训练数据。包含去除乱码、重复内容、繁简转换、全角转半角以及格式标准化处理。'),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'RAG知识库构建预处理模板', '专为RAG场景设计。重点去除目录、图注、XML/HTML标签等对向量检索无意义的噪声,并进行段落级去重以优化切片质量。'),
+    ('883b1722-b5ce-7407-d049-779988770004', '原始Web爬虫数据清洗模板', '针对互联网爬取的脏数据进行清洗。重点去除Emoji表情、URL链接、HTML标签以及不可见字符。'),
+    ('994c2833-c6df-8518-e150-880099880005', '多模态/CV模型训练预处理模板', '针对图像数据集处理。包含去除模糊/重复/相似图片,图片方向校正,目标检测预标注,以及尺寸和格式的统一化。')
+ON CONFLICT (id) DO NOTHING;
 
--- 插入初始数据 - 操作员实例(文本清洗模板)
 INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
 VALUES
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, NULL),
-    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, NULL)
-    ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
-
--- 插入初始数据 - 操作员实例(图片清洗模板)
+    ('550e8400-e29b-41d4-a716-446655440001', 'PoliticalWordCleaner', 1, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'SexualAndViolentWordCleaner', 2, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'PiiDetector', 3, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedIdNumber', 4, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedCreditCardNumber', 5, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedPhoneNumber', 6, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'EmailNumberCleaner', 7, NULL),
+    ('550e8400-e29b-41d4-a716-446655440001', 'AnonymizedIpAddress', 8, NULL)
+ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
+
+INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
+VALUES
+    ('661f9500-f3ac-52e5-b827-557766550002', 'GrableCharactersCleaner', 1, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'InvisibleCharactersCleaner', 2, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'FullWidthCharacterCleaner', 3, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'TraditionalChineseCleaner', 4, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithShortOrLongLengthFilter', 5, '{"fileLength": {"defaultVal": [50, 8192]}}'),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighRepeatPhraseRateFilter', 6, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighSpecialCharRateFilter', 7, NULL),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'DuplicateFilesFilter', 8, NULL)
+ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
+
+
+INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
+VALUES
+    ('772a0611-a4bd-63f6-c938-668877660003', 'HtmlTagCleaner', 1, '{"removeTableTags": {"defaultVal": "false"}}'), -- 表格对RAG可能有价值,暂不去除表格
+    ('772a0611-a4bd-63f6-c938-668877660003', 'ContentCleaner', 2, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'LegendCleaner', 3, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'XMLTagCleaner', 4, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'UnicodeSpaceCleaner', 5, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'ExtraSpaceCleaner', 6, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'DuplicateSentencesFilter', 7, NULL),
+    ('772a0611-a4bd-63f6-c938-668877660003', 'FileWithShortOrLongLengthFilter', 8, '{"fileLength": {"defaultVal": [20, 100000]}}')
+ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
+
+INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
+VALUES
+    ('883b1722-b5ce-7407-d049-779988770004', 'HtmlTagCleaner', 1, '{"removeTableTags": {"defaultVal": "true"}}'),
+    ('883b1722-b5ce-7407-d049-779988770004', 'AnonymizedUrlCleaner', 2, NULL),
+    ('883b1722-b5ce-7407-d049-779988770004', 'EmojiCleaner', 3, NULL),
+    ('883b1722-b5ce-7407-d049-779988770004', 'InvisibleCharactersCleaner', 4, NULL),
+    ('883b1722-b5ce-7407-d049-779988770004', 'ExtraSpaceCleaner', 5, NULL),
+    ('883b1722-b5ce-7407-d049-779988770004', 'DuplicateFilesFilter', 6, '{"fileDuplicateThreshold": {"defaultVal": 0.6}}')
+ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
+
 INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
 VALUES
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, NULL),
-    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, NULL)
-    ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
\ No newline at end of file
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgBlurredImagesCleaner', 1, NULL),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgDuplicatedImagesCleaner', 2, NULL),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgSimilarImagesCleaner', 3, NULL),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgDirectionCorrect', 4, NULL),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgResize', 5, '{"targetSize": {"properties": [{"name": "宽度", "defaultVal": 512}, {"name": "高度", "defaultVal": 512}]}}'),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgTypeUnify', 6, '{"imgType": {"defaultVal": "jpg"}}')
+ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
\ No newline at end of file

From ee26b20194afc82e200cd87169d99ce118fa229a Mon Sep 17 00:00:00 2001
From: hhhhsc <1710496817@qq.com>
Date: Thu, 12 Feb 2026 17:08:10 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E6=A8=A1=E7=89=88=E6=8B=86=E5=88=86?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (commentary only, not part of the commit):

* The subject decodes to "模版拆分适配" (adapt to the template split).
* stop_cleaning_task now awaits db.commit() after task_service.stop_task().
* ImgResize: the single "targetSize" setting is split into "widthSize" and
  "heightSize" in metadata.yml, process.py and data-operator-init.sql; the
  seeded settings_override values become plain {"<key>": value} objects.
* Changes relative to the patch as first sent (hunk header and diffstat
  below are updated to match):
  - process.py falls back to a legacy two-element "targetSize" value when
    widthSize/heightSize are absent, instead of silently using 256x256.
  - data-operator-init.sql keeps a "name_en" label on both settings; the
    row it replaces carried "name_en": "Resample Size".
  - heightSize lists "name" before "type", matching widthSize.
  The "index" lines still carry the blob ids of the original post-image.

 .../interface/cleaning_task_routes.py      |  1 +
 runtime/ops/mapper/img_resize/metadata.yml | 34 +++++++++----------
 runtime/ops/mapper/img_resize/process.py   |  9 ++++-
 scripts/db/data-cleaning-init.sql          | 14 ++++----
 scripts/db/data-operator-init.sql          |  2 +-
 5 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py
index 1f8cba2b..732c1266 100644
--- a/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py
+++ b/runtime/datamate-python/app/module/cleaning/interface/cleaning_task_routes.py
@@ -181,6 +181,7 @@ async def stop_cleaning_task(
     """Stop cleaning task"""
     task_service = _get_task_service(db)
     await task_service.stop_task(db, task_id)
+    await db.commit()
     return StandardResponse(code="0", message="success", data=task_id)
 
 
diff --git a/runtime/ops/mapper/img_resize/metadata.yml b/runtime/ops/mapper/img_resize/metadata.yml
index d680629c..a5895bdc 100644
--- a/runtime/ops/mapper/img_resize/metadata.yml
+++ b/runtime/ops/mapper/img_resize/metadata.yml
@@ -15,21 +15,19 @@ effect:
 inputs: 'image'
 outputs: 'image'
 settings:
-  targetSize:
-    name: 重采样尺寸
-    type: multiple
-    properties:
-      - type: inputNumber
-        name: 宽度
-        description: 像素
-        defaultVal: 256
-        min: 1
-        max: 4096
-        step: 1
-      - type: inputNumber
-        name: 高度
-        description: 像素
-        defaultVal: 256
-        min: 1
-        max: 4096
-        step: 1
\ No newline at end of file
+  widthSize:
+    name: 宽度
+    type: inputNumber
+    description: 像素
+    defaultVal: 256
+    min: 1
+    max: 4096
+    step: 1
+  heightSize:
+    name: 高度
+    type: inputNumber
+    description: 像素
+    defaultVal: 256
+    min: 1
+    max: 4096
+    step: 1
\ No newline at end of file
diff --git a/runtime/ops/mapper/img_resize/process.py b/runtime/ops/mapper/img_resize/process.py
index 248237b5..5c444b56 100644
--- a/runtime/ops/mapper/img_resize/process.py
+++ b/runtime/ops/mapper/img_resize/process.py
@@ -17,7 +17,14 @@ class ImgResize(Mapper):
 
     def __init__(self, *args, **kwargs):
         super(ImgResize, self).__init__(*args, **kwargs)
-        self._target_size = kwargs.get("targetSize", [256, 256])
+        # Honour a legacy [width, height] "targetSize" value so settings saved
+        # before the widthSize/heightSize split keep their requested size.
+        legacy = kwargs.get("targetSize")
+        if not (isinstance(legacy, (list, tuple)) and len(legacy) == 2):
+            legacy = (256, 256)
+        self._width = int(kwargs.get("widthSize", legacy[0]))
+        self._height = int(kwargs.get("heightSize", legacy[1]))
+        self._target_size = [self._width, self._height]
 
     @classmethod
     def _img_resize(cls, data: List[float], target_size: List[int]) -> List[float]:
diff --git a/scripts/db/data-cleaning-init.sql b/scripts/db/data-cleaning-init.sql
index 0440729f..58b7add1 100644
--- a/scripts/db/data-cleaning-init.sql
+++ b/scripts/db/data-cleaning-init.sql
@@ -155,7 +155,7 @@ VALUES
     ('661f9500-f3ac-52e5-b827-557766550002', 'InvisibleCharactersCleaner', 2, NULL),
     ('661f9500-f3ac-52e5-b827-557766550002', 'FullWidthCharacterCleaner', 3, NULL),
     ('661f9500-f3ac-52e5-b827-557766550002', 'TraditionalChineseCleaner', 4, NULL),
-    ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithShortOrLongLengthFilter', 5, '{"fileLength": {"defaultVal": [50, 8192]}}'),
+    ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithShortOrLongLengthFilter', 5, '{"fileLength": [50, 8192]}'),
     ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighRepeatPhraseRateFilter', 6, NULL),
     ('661f9500-f3ac-52e5-b827-557766550002', 'FileWithHighSpecialCharRateFilter', 7, NULL),
     ('661f9500-f3ac-52e5-b827-557766550002', 'DuplicateFilesFilter', 8, NULL)
@@ -164,24 +164,24 @@ ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
 
 INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
 VALUES
-    ('772a0611-a4bd-63f6-c938-668877660003', 'HtmlTagCleaner', 1, '{"removeTableTags": {"defaultVal": "false"}}'), -- 表格对RAG可能有价值,暂不去除表格
+    ('772a0611-a4bd-63f6-c938-668877660003', 'HtmlTagCleaner', 1, '{"removeTableTags": "false"}'), -- 表格对RAG可能有价值,暂不去除表格
     ('772a0611-a4bd-63f6-c938-668877660003', 'ContentCleaner', 2, NULL),
     ('772a0611-a4bd-63f6-c938-668877660003', 'LegendCleaner', 3, NULL),
     ('772a0611-a4bd-63f6-c938-668877660003', 'XMLTagCleaner', 4, NULL),
     ('772a0611-a4bd-63f6-c938-668877660003', 'UnicodeSpaceCleaner', 5, NULL),
     ('772a0611-a4bd-63f6-c938-668877660003', 'ExtraSpaceCleaner', 6, NULL),
     ('772a0611-a4bd-63f6-c938-668877660003', 'DuplicateSentencesFilter', 7, NULL),
-    ('772a0611-a4bd-63f6-c938-668877660003', 'FileWithShortOrLongLengthFilter', 8, '{"fileLength": {"defaultVal": [20, 100000]}}')
+    ('772a0611-a4bd-63f6-c938-668877660003', 'FileWithShortOrLongLengthFilter', 8, '{"fileLength": [20, 100000]}')
 ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
 
 INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
 VALUES
-    ('883b1722-b5ce-7407-d049-779988770004', 'HtmlTagCleaner', 1, '{"removeTableTags": {"defaultVal": "true"}}'),
+    ('883b1722-b5ce-7407-d049-779988770004', 'HtmlTagCleaner', 1, '{"removeTableTags": "true"}'),
     ('883b1722-b5ce-7407-d049-779988770004', 'AnonymizedUrlCleaner', 2, NULL),
     ('883b1722-b5ce-7407-d049-779988770004', 'EmojiCleaner', 3, NULL),
     ('883b1722-b5ce-7407-d049-779988770004', 'InvisibleCharactersCleaner', 4, NULL),
     ('883b1722-b5ce-7407-d049-779988770004', 'ExtraSpaceCleaner', 5, NULL),
-    ('883b1722-b5ce-7407-d049-779988770004', 'DuplicateFilesFilter', 6, '{"fileDuplicateThreshold": {"defaultVal": 0.6}}')
+    ('883b1722-b5ce-7407-d049-779988770004', 'DuplicateFilesFilter', 6, '{"fileDuplicateThreshold": 0.6}')
 ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
 
 INSERT INTO t_operator_instance (instance_id, operator_id, op_index, settings_override)
@@ -190,6 +190,6 @@ VALUES
     ('994c2833-c6df-8518-e150-880099880005', 'ImgDuplicatedImagesCleaner', 2, NULL),
     ('994c2833-c6df-8518-e150-880099880005', 'ImgSimilarImagesCleaner', 3, NULL),
     ('994c2833-c6df-8518-e150-880099880005', 'ImgDirectionCorrect', 4, NULL),
-    ('994c2833-c6df-8518-e150-880099880005', 'ImgResize', 5, '{"targetSize": {"properties": [{"name": "宽度", "defaultVal": 512}, {"name": "高度", "defaultVal": 512}]}}'),
-    ('994c2833-c6df-8518-e150-880099880005', 'ImgTypeUnify', 6, '{"imgType": {"defaultVal": "jpg"}}')
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgResize', 5, '{"widthSize": 512, "heightSize": 512}'),
+    ('994c2833-c6df-8518-e150-880099880005', 'ImgTypeUnify', 6, '{"imgType": "jpg"}')
 ON CONFLICT (instance_id, operator_id, op_index) DO NOTHING;
\ No newline at end of file
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
index e6650e4c..286ec17b 100644
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -207,7 +207,7 @@ VALUES
 ('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
 ('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'),
 ('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', null, null, '', 8192, false, 'system', 'system'),
-('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "description": "像素", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', 8192, false, 'system', 'system'),
+('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"widthSize":{"name":"宽度","name_en":"Width","type":"inputNumber","description":"像素","defaultVal":256,"min":1,"max":4096,"step":1},"heightSize":{"name":"高度","name_en":"Height","type":"inputNumber","description":"像素","defaultVal":256,"min":1,"max":4096,"step":1}}', '', 8192, false, 'system', 'system'),
 ('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
 ('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),
 ('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 4096, false, 'system', 'system'),