from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from compressed_tensors.offload import dispatch_model

MODEL_ID = "Qwen/Qwen3-Coder-Next"

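# Load the model and tokenizer.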
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

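# Select the calibration dataset.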
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

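# Select the number of calibration samples and the maximum sequence length.
# More samples generally improves calibration quality at the cost of runtime.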
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

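# Load and shuffle the calibration split.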
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


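# Flatten each chat into a single string using the model's chat template.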
def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


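# Tokenize the rendered text. Special tokens are skipped because the chat
# template has already inserted them.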
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

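# Configure the quantization recipe: NVFP4 for all Linear layers, using an
# MSE-based observer for the weight scales. The LM head, MoE router and
# shared-expert gates, and linear-attention layers are left unquantized.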
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    weight_observer="mse",
    ignore=[
        "re:.*lm_head",
        "re:.*mlp.gate$",
        "re:.*mlp.shared_expert_gate$",
        "re:.*linear_attn.*",
    ],
)

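# Apply quantization. moe_calibrate_all_experts routes calibration activations
# through every expert so rarely-selected experts are still observed.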
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    moe_calibrate_all_experts=True,
)

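# Confirm generations from the quantized model look sane.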
| | print("\n\n") |
| | print("========== SAMPLE GENERATION ==============") |
| |
|
| | dispatch_model(model) |
| |
|
| | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( |
| | model.device |
| | ) |
| | output = model.generate(input_ids, max_new_tokens=100) |
| | print(tokenizer.decode(output[0])) |
| | print("==========================================\n\n") |
| |
|
| |
|
| | |
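# Save the compressed checkpoint and tokenizer to disk.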
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
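# The saved directory can then be served by inference engines that support
# compressed-tensors checkpoints (e.g. vLLM), hardware permitting.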