From c2e9631b53f39733d6880851159d68c9c6e39188 Mon Sep 17 00:00:00 2001 From: yibotongxue <161041627+yibotongxue@users.noreply.github.com> Date: Sat, 4 Apr 2026 21:47:08 +0800 Subject: [PATCH] docs: update docs for mmd operator --- .../operators/text_sft/eval/MMDDatasetEvaluator.md | 14 ++++++-------- .../operators/text_sft/eval/MMDDatasetEvaluator.md | 14 ++++++-------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/docs/en/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md b/docs/en/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md index 7e4073158..debe6962a 100644 --- a/docs/en/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md +++ b/docs/en/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md @@ -15,8 +15,8 @@ def __init__( self, ref_frame: DataFlowStorage, *, - ref_max_sample_num: int = 5000, - ref_shuffle_seed: int = 42, + max_sample_num: int = 5000, + shuffle_seed: int = 42, ref_instruction_key: str = "input", ref_output_key: str = "output", kernel_type: Literal["RBF"] = "RBF", @@ -43,8 +43,8 @@ def __init__( | Parameter | Type | Default | Description | | :--- | :--- | :--- | :--- | | **ref_frame** | DataFlowStorage | Required | The reference dataset used as the distribution baseline. | -| **ref_max_sample_num** | int | `5000` | Maximum number of samples to draw from the reference dataset. | -| **ref_shuffle_seed** | int | `42` | Random seed for sampling the reference dataset. | +| **max_sample_num** | int | `5000` | Maximum number of samples to draw from both the reference and evaluation datasets. | +| **shuffle_seed** | int | `42` | Random seed for sampling. | | **ref_instruction_key** | str | `'input'` | Column name for the instruction field in the reference dataset. | | **ref_output_key** | str | `'output'` | Column name for the output field in the reference dataset. | | **kernel_type** | str | `'RBF'` | Kernel function type; currently only `'RBF'` is supported. | @@ -74,8 +74,6 @@ def run( storage: DataFlowStorage, input_instruction_key: str, input_output_key: str, - max_sample_num: int | None = None, - shuffle_seed: int | None = None, ) -> tuple[float, dict[str, Any]] ``` @@ -84,8 +82,6 @@ def run( | **storage** | DataFlowStorage | Required | The DataFlowStorage instance containing the evaluation dataset. | | **input_instruction_key** | str | Required | Column name for the instruction field in the evaluation dataset. | | **input_output_key** | str | Required | Column name for the output field in the evaluation dataset. | -| **max_sample_num** | int | `None` | Maximum samples from the evaluation dataset; falls back to `ref_max_sample_num` if not set. | -| **shuffle_seed** | int | `None` | Random seed for sampling the evaluation dataset; falls back to `ref_shuffle_seed` if not set. | ## 🧠 Example Usage @@ -100,6 +96,8 @@ eval_storage = FileStorage(first_entry_file_name="eval_data.jsonl") # Initialize the evaluator evaluator = MMDDatasetEvaluator( ref_frame=ref_storage.step(), + max_sample_num=5000, + shuffle_seed=42, ref_instruction_key="instruction", ref_output_key="output", embedding_type="sentence_transformers", diff --git a/docs/zh/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md b/docs/zh/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md index 4446190af..fcea2de25 100644 --- a/docs/zh/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md +++ b/docs/zh/notes/api/operators/text_sft/eval/MMDDatasetEvaluator.md @@ -15,8 +15,8 @@ def __init__( self, ref_frame: DataFlowStorage, *, - ref_max_sample_num: int = 5000, - ref_shuffle_seed: int = 42, + max_sample_num: int = 5000, + shuffle_seed: int = 42, ref_instruction_key: str = "input", ref_output_key: str = "output", kernel_type: Literal["RBF"] = "RBF", @@ -45,8 +45,8 @@ def __init__( | 参数名 | 类型 | 默认值 | 说明 | | :--- | :--- | :--- | :--- | | **ref_frame** | DataFlowStorage | 必需 | 参考数据集(DataFlowStorage),作为分布比较的基准。 | -| **ref_max_sample_num** | int | `5000` | 从参考数据集中采样的最大样本数。 | -| **ref_shuffle_seed** | int | `42` | 参考数据集采样的随机种子。 | +| **max_sample_num** | int | `5000` | 从参考数据集和评估数据集中采样的最大样本数。 | +| **shuffle_seed** | int | `42` | 数据集采样的随机种子。 | | **ref_instruction_key** | str | `'input'` | 参考数据集中指令字段的列名。 | | **ref_output_key** | str | `'output'` | 参考数据集中输出字段的列名。 | | **kernel_type** | str | `'RBF'` | 核函数类型,当前仅支持 `'RBF'`。 | @@ -76,8 +76,6 @@ def run( storage: DataFlowStorage, input_instruction_key: str, input_output_key: str, - max_sample_num: int | None = None, - shuffle_seed: int | None = None, ) -> tuple[float, dict[str, Any]] ``` @@ -90,8 +88,6 @@ def run( | **storage** | DataFlowStorage | 必需 | 包含评估数据集的数据流存储实例。 | | **input_instruction_key** | str | 必需 | 评估数据集中指令字段的列名。 | | **input_output_key** | str | 必需 | 评估数据集中输出字段的列名。 | -| **max_sample_num** | int | `None` | 评估数据集的最大采样数;未设置时默认使用 `ref_max_sample_num`。 | -| **shuffle_seed** | int | `None` | 评估数据集采样的随机种子;未设置时默认使用 `ref_shuffle_seed`。 | ## 🧠 示例用法 @@ -106,6 +102,8 @@ eval_storage = FileStorage(first_entry_file_name="eval_data.jsonl") # 初始化评估器 evaluator = MMDDatasetEvaluator( ref_frame=ref_storage.step(), + max_sample_num=5000, + shuffle_seed=42, ref_instruction_key="instruction", ref_output_key="output", embedding_type="sentence_transformers",