From c8467ff1092d9dc90818708fc3711a0675179770 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 2 Mar 2026 16:27:09 +0800 Subject: [PATCH 1/7] enhance llmc CI on GPU and XPU Signed-off-by: chensuyue --- .../integrations/test_llmc_integration.py | 250 +++++++++++++++++- test/test_xpu/test_llmc_integration.py | 249 +++++++++++++++++ 2 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 test/test_xpu/test_llmc_integration.py diff --git a/test/test_cuda/integrations/test_llmc_integration.py b/test/test_cuda/integrations/test_llmc_integration.py index 3422e3cdc..06daab31d 120000 --- a/test/test_cuda/integrations/test_llmc_integration.py +++ b/test/test_cuda/integrations/test_llmc_integration.py @@ -1 +1,249 @@ -../../test_cpu/integrations/test_llmc_integration.py \ No newline at end of file +import pytest +import torch +from auto_round.calib_dataset import get_dataset +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.autoround import AutoRoundModifier + +recipe_str = """ +quant_stage: + quant_modifiers: + AutoRoundModifier: + ignore: ["lm_head"] + iters: 10 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 +""" + +recipe_modifier_full = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) +recipe_modifier_nvfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=2, + scheme="NVFP4", +) + +recipe_modifier_mxfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + scheme="MXFP4", +) + +w8a8_dynamic_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="token", dynamic=True + ), + ) + }, +) + +w8a8_static_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="tensor" + ), + ) + }, +) + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") +@pytest.mark.parametrize( + "recipe", + [ + recipe_str, + recipe_modifier_full, + recipe_modifier_nvfp4, + recipe_modifier_mxfp4, + ], +) +def test_oneshot_application(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires at least 2 Cuda GPUs") +def test_oneshot_with_device_ids(tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=512, + nsamples=4, + ) + + device = "cuda:0" + + recipe = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, + device_ids="0,1", + ) + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") +@pytest.mark.parametrize( + "recipe", + [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier], +) +def test_rtn_oneshot(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + act_args = quantization_config.config_groups["group_0"].input_activations + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits + assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy + if act_args is not None: + assert ( + act_args.num_bits + == recipe.config_groups["group_0"].input_activations.num_bits + ) + assert ( + act_args.strategy + == recipe.config_groups["group_0"].input_activations.strategy + ) + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file diff --git a/test/test_xpu/test_llmc_integration.py b/test/test_xpu/test_llmc_integration.py new file mode 100644 index 000000000..217e53195 --- /dev/null +++ b/test/test_xpu/test_llmc_integration.py @@ -0,0 +1,249 @@ +import pytest +import torch +from auto_round.calib_dataset import get_dataset +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.autoround import AutoRoundModifier + +recipe_str = """ +quant_stage: + quant_modifiers: + AutoRoundModifier: + ignore: ["lm_head"] + iters: 10 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 +""" + +recipe_modifier_full = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) +recipe_modifier_nvfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=2, + scheme="NVFP4", +) + +recipe_modifier_mxfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + scheme="MXFP4", +) + +w8a8_dynamic_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="token", dynamic=True + ), + ) + }, +) + +w8a8_static_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="tensor" + ), + ) + }, +) + + +@pytest.mark.skipif(torch.xpu.device_count() < 1, reason="test requires at least 1 XPU") +@pytest.mark.parametrize( + "recipe", + [ + recipe_str, + recipe_modifier_full, + recipe_modifier_nvfp4, + recipe_modifier_mxfp4, + ], +) +def test_oneshot_application(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "xpu:0" if torch.xpu.is_available() else "cpu" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.xpu.device_count() < 2, reason="test requires at least 2 XPUs") +def test_oneshot_with_device_ids(tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=512, + nsamples=4, + ) + + device = "xpu:0" + + recipe = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, + device_ids="0,1", + ) + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.xpu.device_count() < 1, reason="test requires at least 1 XPU") +@pytest.mark.parametrize( + "recipe", + [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier], +) +def test_rtn_oneshot(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "xpu:0" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + act_args = quantization_config.config_groups["group_0"].input_activations + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits + assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy + if act_args is not None: + assert ( + act_args.num_bits + == recipe.config_groups["group_0"].input_activations.num_bits + ) + assert ( + act_args.strategy + == recipe.config_groups["group_0"].input_activations.strategy + ) + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file From a97014144171beb73f66d16b89aefaa94c3f088a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 08:41:47 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../integrations/test_llmc_integration.py | 249 ------------------ test/test_xpu/test_llmc_integration.py | 48 ++-- 2 files changed, 19 insertions(+), 278 deletions(-) delete mode 120000 test/test_cuda/integrations/test_llmc_integration.py diff --git a/test/test_cuda/integrations/test_llmc_integration.py b/test/test_cuda/integrations/test_llmc_integration.py deleted file mode 120000 index 06daab31d..000000000 --- a/test/test_cuda/integrations/test_llmc_integration.py +++ /dev/null @@ -1,249 +0,0 @@ -import pytest -import torch -from auto_round.calib_dataset import get_dataset -from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.autoround import AutoRoundModifier - -recipe_str = """ -quant_stage: - quant_modifiers: - AutoRoundModifier: - ignore: ["lm_head"] - iters: 10 - config_groups: - group_0: - targets: - - "Linear" - input_activations: null - output_activations: null - weights: - num_bits: 4 - type: "int" - symmetric: true - strategy: group - group_size: 128 -""" - -recipe_modifier_full = AutoRoundModifier( - ignore=["lm_head"], - iters=10, - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), - ) - }, -) -recipe_modifier_nvfp4 = AutoRoundModifier( - ignore=["lm_head"], - iters=2, - scheme="NVFP4", -) - -recipe_modifier_mxfp4 = AutoRoundModifier( - ignore=["lm_head"], - iters=0, - scheme="MXFP4", -) - -w8a8_dynamic_recipe_modifier = AutoRoundModifier( - ignore=["lm_head"], - iters=0, - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="token", dynamic=True - ), - ) - }, -) - -w8a8_static_recipe_modifier = AutoRoundModifier( - ignore=["lm_head"], - iters=0, - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="tensor" - ), - ) - }, -) - - -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") -@pytest.mark.parametrize( - "recipe", - [ - recipe_str, - recipe_modifier_full, - recipe_modifier_nvfp4, - recipe_modifier_mxfp4, - ], -) -def test_oneshot_application(recipe, tmp_path): - output = tmp_path / "oneshot_output" - model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - tokenizer = AutoTokenizer.from_pretrained(model) - dataset = get_dataset( - tokenizer=tokenizer, - seqlen=1024, - nsamples=32, - ) - - device = "cuda:0" if torch.cuda.is_available() else "cpu" - - oneshot( - model=model, - dataset=dataset, - output_dir=output, - recipe=recipe, - ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) - - # Check that the model is quantized - # decompress() will attach a quantization_config to the model - # as we decompress right away - quantization_config = model_loaded.config.quantization_config.quantization_config - assert quantization_config is not None - - # check config is set properly - assert "lm_head" in quantization_config.ignore - assert len(quantization_config.config_groups) == 1 - quant_scheme = quantization_config.config_groups["group_0"] - assert isinstance(quant_scheme, QuantizationScheme) - - weight_args = quantization_config.config_groups["group_0"].weights - assert isinstance(weight_args, QuantizationArgs) - assert weight_args.num_bits == 4 - - # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") - - # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires at least 2 Cuda GPUs") -def test_oneshot_with_device_ids(tmp_path): - output = tmp_path / "oneshot_output" - model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - tokenizer = AutoTokenizer.from_pretrained(model) - dataset = get_dataset( - tokenizer=tokenizer, - seqlen=512, - nsamples=4, - ) - - device = "cuda:0" - - recipe = AutoRoundModifier( - ignore=["lm_head"], - iters=10, - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), - ) - }, - device_ids="0,1", - ) - - oneshot( - model=model, - dataset=dataset, - output_dir=output, - recipe=recipe, - ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) - - # Check that the model is quantized - # decompress() will attach a quantization_config to the model - # as we decompress right away - quantization_config = model_loaded.config.quantization_config.quantization_config - assert quantization_config is not None - - # check config is set properly - assert "lm_head" in quantization_config.ignore - assert len(quantization_config.config_groups) == 1 - quant_scheme = quantization_config.config_groups["group_0"] - assert isinstance(quant_scheme, QuantizationScheme) - - weight_args = quantization_config.config_groups["group_0"].weights - assert isinstance(weight_args, QuantizationArgs) - assert weight_args.num_bits == 4 - - # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") - - # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") - - -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") -@pytest.mark.parametrize( - "recipe", - [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier], -) -def test_rtn_oneshot(recipe, tmp_path): - output = tmp_path / "oneshot_output" - model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - tokenizer = AutoTokenizer.from_pretrained(model) - dataset = get_dataset( - tokenizer=tokenizer, - seqlen=1024, - nsamples=32, - ) - - device = "cuda:0" if torch.cuda.is_available() else "cpu" - - oneshot( - model=model, - dataset=dataset, - output_dir=output, - recipe=recipe, - ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) - - quantization_config = model_loaded.config.quantization_config.quantization_config - assert quantization_config is not None - - # check config is set properly - assert "lm_head" in quantization_config.ignore - assert len(quantization_config.config_groups) == 1 - quant_scheme = quantization_config.config_groups["group_0"] - assert isinstance(quant_scheme, QuantizationScheme) - - weight_args = quantization_config.config_groups["group_0"].weights - act_args = quantization_config.config_groups["group_0"].input_activations - assert isinstance(weight_args, QuantizationArgs) - assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits - assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy - if act_args is not None: - assert ( - act_args.num_bits - == recipe.config_groups["group_0"].input_activations.num_bits - ) - assert ( - act_args.strategy - == recipe.config_groups["group_0"].input_activations.strategy - ) - - # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") - - # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file diff --git a/test/test_xpu/test_llmc_integration.py b/test/test_xpu/test_llmc_integration.py index 217e53195..de95ccf0d 100644 --- a/test/test_xpu/test_llmc_integration.py +++ b/test/test_xpu/test_llmc_integration.py @@ -1,11 +1,11 @@ import pytest import torch -from auto_round.calib_dataset import get_dataset from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme -from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor import oneshot from llmcompressor.modifiers.autoround import AutoRoundModifier +from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round.calib_dataset import get_dataset recipe_str = """ quant_stage: @@ -56,9 +56,7 @@ "group_0": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="token", dynamic=True - ), + input_activations=QuantizationArgs(num_bits=8, type="float", strategy="token", dynamic=True), ) }, ) @@ -70,9 +68,7 @@ "group_0": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="tensor" - ), + input_activations=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), ) }, ) @@ -125,12 +121,12 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") @pytest.mark.skipif(torch.xpu.device_count() < 2, reason="test requires at least 2 XPUs") @@ -183,12 +179,12 @@ def test_oneshot_with_device_ids(tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") @pytest.mark.skipif(torch.xpu.device_count() < 1, reason="test requires at least 1 XPU") @@ -231,19 +227,13 @@ def test_rtn_oneshot(recipe, tmp_path): assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy if act_args is not None: - assert ( - act_args.num_bits - == recipe.config_groups["group_0"].input_activations.num_bits - ) - assert ( - act_args.strategy - == recipe.config_groups["group_0"].input_activations.strategy - ) + assert act_args.num_bits == recipe.config_groups["group_0"].input_activations.num_bits + assert act_args.strategy == recipe.config_groups["group_0"].input_activations.strategy # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") From 9119bcbaa2d49cf925c5aa373edab23b91a3b046 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 2 Mar 2026 16:54:00 +0800 Subject: [PATCH 3/7] update test requirements for xpu Signed-off-by: chensuyue --- .azure-pipelines/scripts/ut/run_ut_xpu.sh | 13 +++++++++++++ test/test_xpu/requirements_llmc.txt | 1 + 2 files changed, 14 insertions(+) create mode 100644 test/test_xpu/requirements_llmc.txt diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index b8f7bb875..5f28c6698 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -24,11 +24,24 @@ ut_log_name=${LOG_DIR}/ut.log find ./test_ark -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_ark.sh cat run_ark.sh find ./test_xpu -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_xpu.sh +sed -i "/test_llmc_integration.py/d" run_xpu.sh cat run_xpu.sh +find ./test_xpu -name "test_llmc_integration.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_xpu_llmc.sh +cat run_xpu_llmc.sh +echo "##[group]Run xpu integration test with xpu..." bash run_xpu.sh 2>&1 | tee "${ut_log_name}" +echo "##[endgroup]" +echo "##[group]Run Ark integration test with xpu..." numactl -C "0-27" -m 0 bash run_ark.sh 2>&1 | tee -a "${ut_log_name}" +echo "##[endgroup]" + +echo "##[group]Run LLMC integration test with xpu..." +uv pip install -r requirements_llmc.txt +uv pip list +bash run_xpu_llmc.sh 2>&1 | tee -a "${ut_log_name}" +echo "##[endgroup]" cp report.html ${LOG_DIR}/ cp coverage.xml ${LOG_DIR}/ diff --git a/test/test_xpu/requirements_llmc.txt b/test/test_xpu/requirements_llmc.txt new file mode 100644 index 000000000..0af08f61e --- /dev/null +++ b/test/test_xpu/requirements_llmc.txt @@ -0,0 +1 @@ +llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main From f805e19930a9b99ae49d6b4da7b9c552c6428df4 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 2 Mar 2026 22:14:08 +0800 Subject: [PATCH 4/7] add cuda llmc test back Signed-off-by: chensuyue --- .../integrations/test_llmc_integration.py | 249 ++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 test/test_cuda/integrations/test_llmc_integration.py diff --git a/test/test_cuda/integrations/test_llmc_integration.py b/test/test_cuda/integrations/test_llmc_integration.py new file mode 100644 index 000000000..d0e320564 --- /dev/null +++ b/test/test_cuda/integrations/test_llmc_integration.py @@ -0,0 +1,249 @@ +import pytest +import torch +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from llmcompressor import oneshot +from llmcompressor.modifiers.autoround import AutoRoundModifier +from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round.calib_dataset import get_dataset + +recipe_str = """ +quant_stage: + quant_modifiers: + AutoRoundModifier: + ignore: ["lm_head"] + iters: 10 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 +""" + +recipe_modifier_full = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) +recipe_modifier_nvfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=2, + scheme="NVFP4", +) + +recipe_modifier_mxfp4 = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + scheme="MXFP4", +) + +w8a8_dynamic_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="token", dynamic=True + ), + ) + }, +) + +w8a8_static_recipe_modifier = AutoRoundModifier( + ignore=["lm_head"], + iters=0, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), + input_activations=QuantizationArgs( + num_bits=8, type="float", strategy="tensor" + ), + ) + }, +) + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") +@pytest.mark.parametrize( + "recipe", + [ + recipe_str, + recipe_modifier_full, + recipe_modifier_nvfp4, + recipe_modifier_mxfp4, + ], +) +def test_oneshot_application(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires at least 2 Cuda GPUs") +def test_oneshot_with_device_ids(tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=512, + nsamples=4, + ) + + device = "cuda:0" + + recipe = AutoRoundModifier( + ignore=["lm_head"], + iters=10, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, + device_ids="0,1", + ) + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # decompress() will attach a quantization_config to the model + # as we decompress right away + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") +@pytest.mark.parametrize( + "recipe", + [w8a8_dynamic_recipe_modifier, w8a8_static_recipe_modifier], +) +def test_rtn_oneshot(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + act_args = quantization_config.config_groups["group_0"].input_activations + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits + assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy + if act_args is not None: + assert ( + act_args.num_bits + == recipe.config_groups["group_0"].input_activations.num_bits + ) + assert ( + act_args.strategy + == recipe.config_groups["group_0"].input_activations.strategy + ) + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file From 5becea12cee4400bdbff1a7756a9b8bb3f10083d Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 2 Mar 2026 22:21:24 +0800 Subject: [PATCH 5/7] fix req path Signed-off-by: chensuyue --- .azure-pipelines/scripts/ut/run_ut_xpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index 5f28c6698..07648c3ac 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -38,7 +38,7 @@ numactl -C "0-27" -m 0 bash run_ark.sh 2>&1 | tee -a "${ut_log_name}" echo "##[endgroup]" echo "##[group]Run LLMC integration test with xpu..." -uv pip install -r requirements_llmc.txt +uv pip install -r ./test_xpu/requirements_llmc.txt uv pip list bash run_xpu_llmc.sh 2>&1 | tee -a "${ut_log_name}" echo "##[endgroup]" From 234955ccaee87a2c183a110b6661eeff03efd9a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 14:23:10 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../integrations/test_llmc_integration.py | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/test/test_cuda/integrations/test_llmc_integration.py b/test/test_cuda/integrations/test_llmc_integration.py index d0e320564..c98faf7bf 100644 --- a/test/test_cuda/integrations/test_llmc_integration.py +++ b/test/test_cuda/integrations/test_llmc_integration.py @@ -56,9 +56,7 @@ "group_0": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs(num_bits=8, type="float", strategy="channel"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="token", dynamic=True - ), + input_activations=QuantizationArgs(num_bits=8, type="float", strategy="token", dynamic=True), ) }, ) @@ -70,9 +68,7 @@ "group_0": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), - input_activations=QuantizationArgs( - num_bits=8, type="float", strategy="tensor" - ), + input_activations=QuantizationArgs(num_bits=8, type="float", strategy="tensor"), ) }, ) @@ -125,12 +121,12 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires at least 2 Cuda GPUs") @@ -183,12 +179,12 @@ def test_oneshot_with_device_ids(tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires at least 1 Cuda GPU") @@ -231,19 +227,13 @@ def test_rtn_oneshot(recipe, tmp_path): assert weight_args.num_bits == recipe.config_groups["group_0"].weights.num_bits assert weight_args.strategy == recipe.config_groups["group_0"].weights.strategy if act_args is not None: - assert ( - act_args.num_bits - == recipe.config_groups["group_0"].input_activations.num_bits - ) - assert ( - act_args.strategy - == recipe.config_groups["group_0"].input_activations.strategy - ) + assert act_args.num_bits == recipe.config_groups["group_0"].input_activations.num_bits + assert act_args.strategy == recipe.config_groups["group_0"].input_activations.strategy # Check a specific layer is quantized - targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj - assert hasattr(targetted_linear_layer, "quantization_scheme") + targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized - not_targetted = model_loaded.lm_head - assert not hasattr(not_targetted, "quantization_scheme") \ No newline at end of file + not_targeted = model_loaded.lm_head + assert not hasattr(not_targeted, "quantization_scheme") From 89c0d81301f8f300207bfd836e0acb56dfbfa6a5 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 3 Mar 2026 17:10:40 +0800 Subject: [PATCH 7/7] for test Signed-off-by: chensuyue --- test/test_xpu/test_llmc_integration.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/test_xpu/test_llmc_integration.py b/test/test_xpu/test_llmc_integration.py index de95ccf0d..2b0310306 100644 --- a/test/test_xpu/test_llmc_integration.py +++ b/test/test_xpu/test_llmc_integration.py @@ -94,15 +94,13 @@ def test_oneshot_application(recipe, tmp_path): nsamples=32, ) - device = "xpu:0" if torch.xpu.is_available() else "cpu" - oneshot( model=model, dataset=dataset, output_dir=output, recipe=recipe, ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map="xpu") # Check that the model is quantized # decompress() will attach a quantization_config to the model @@ -140,8 +138,6 @@ def test_oneshot_with_device_ids(tmp_path): nsamples=4, ) - device = "xpu:0" - recipe = AutoRoundModifier( ignore=["lm_head"], iters=10, @@ -160,7 +156,7 @@ def test_oneshot_with_device_ids(tmp_path): output_dir=output, recipe=recipe, ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map="xpu") # Check that the model is quantized # decompress() will attach a quantization_config to the model @@ -202,15 +198,13 @@ def test_rtn_oneshot(recipe, tmp_path): nsamples=32, ) - device = "xpu:0" - oneshot( model=model, dataset=dataset, output_dir=output, recipe=recipe, ) - model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map="xpu") quantization_config = model_loaded.config.quantization_config.quantization_config assert quantization_config is not None