diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 75bc46c0c..c416dadb6 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1746,8 +1746,10 @@ def _adjust_immediate_packing_and_saving(self): self.low_cpu_mem_usage = False self.is_immediate_saving = False - if self.is_immediate_saving and "int" not in self.data_type: - logger.warning("immediate_saving is only supported for int quantization, set to False") + if self.is_immediate_saving and not ( + "int" in self.data_type or is_nv_fp(self.data_type) or is_mx_fp(self.data_type) + ): + logger.warning("immediate_saving is only supported for int/nv_fp/mx_fp quantization, set to False") self.is_immediate_saving = False if self.orig_output_dir is None: diff --git a/auto_round/compressors/shard_writer.py b/auto_round/compressors/shard_writer.py index f061a92b9..f89ba714a 100644 --- a/auto_round/compressors/shard_writer.py +++ b/auto_round/compressors/shard_writer.py @@ -146,10 +146,11 @@ def _handle_tied_weights(self): filtered_tensors[name] = tensor continue - ptr = tensor.untyped_storage().data_ptr() + ptr = tensor.untyped_storage().data_ptr() + tensor.storage_offset() * tensor.element_size() if ptr not in storage_map: storage_map.add(ptr) filtered_tensors[name] = tensor + self.current_shard_tensors = filtered_tensors def _flush_shard(self): diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py index ff7a9e732..ea2e5a6ed 100644 --- a/test/test_cpu/models/test_moe_model.py +++ b/test/test_cpu/models/test_moe_model.py @@ -64,13 +64,7 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path): # verify the quantized model can be loaded and run inference loaded_model = GptOssForCausalLM.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - if scheme == "MXFP4": - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() - if scheme == "MXFP8": - assert (loaded_m.weight.to("cpu") == m.weight.to("cpu")).all() + inp = torch.randint(0, 100, (1, 32)) with torch.inference_mode(): loaded_out = loaded_model(inp) @@ -84,10 +78,7 @@ def test_llama4(tiny_llama4_model_path): assert quantized_model is not None, "Quantized model should not be None." loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() + inp = torch.randint(0, 100, (1, 32)) with torch.inference_mode(): loaded_out = loaded_model(inp) @@ -110,10 +101,6 @@ def test_qwen3_vl_moe_mxfp(tiny_qwen3_vl_moe_model_path): assert quantized_model is not None, "Quantized model should not be None." loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu") - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() inp = torch.randint(0, 100, (1, 32)) with torch.inference_mode(): loaded_out = loaded_model(inp) diff --git a/test/test_cuda/models/test_fp8_model.py b/test/test_cuda/models/test_fp8_model.py index 1b7e97e4d..e8343b0b2 100644 --- a/test/test_cuda/models/test_fp8_model.py +++ b/test/test_cuda/models/test_fp8_model.py @@ -126,6 +126,7 @@ def test_qwen3_fp8_moe_mxfp(tiny_fp8_qwen_moe_model_path, mock_fp8_capable_devic nsamples=2, seqlen=32, iters=0, + low_cpu_mem_usage=False, ) quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None."