diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md
index a62e0124b77..b90bb49ef87 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/source/features/quantization/fp8.md
@@ -44,6 +44,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves three main steps:
@@ -86,7 +92,7 @@ recipe = QuantizationModifier(
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
 
-# Save the model.
+# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR)
 tokenizer.save_pretrained(SAVE_DIR)
@@ -94,12 +100,6 @@ tokenizer.save_pretrained(SAVE_DIR)
 
 ### 3. Evaluating Accuracy
 
-Install `vllm` and `lm-evaluation-harness`:
-
-```console
-pip install vllm lm-eval==0.4.4
-```
-
 Load and run the model in `vllm`:
 
 ```python
diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md
index f8939e5bf01..be48788a4ef 100644
--- a/docs/source/features/quantization/int4.md
+++ b/docs/source/features/quantization/int4.md
@@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -87,7 +93,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md
index b381f34bccd..d6ddca18e26 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/source/features/quantization/int8.md
@@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -91,7 +97,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md
index 9f36c2949e0..86e6354ec82 100644
--- a/docs/source/features/quantization/quantized_kvcache.md
+++ b/docs/source/features/quantization/quantized_kvcache.md
@@ -126,7 +126,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save quantized model
+# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/docs/source/features/quantization/quark.md b/docs/source/features/quantization/quark.md
index 935ee37a815..955890dbc75 100644
--- a/docs/source/features/quantization/quark.md
+++ b/docs/source/features/quantization/quark.md
@@ -19,6 +19,12 @@ pip install amd-quark
 ```
 You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html) for more installation details.
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 After installing Quark, we will use an example to illustrate how to use Quark.
@@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
 export_config = ExporterConfig(json_export_config=JsonExporterConfig())
 export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
 
+# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
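
For reference, the `vllm` and `lm-eval==0.4.4` installs added above feed an evaluation step like the one below. This is a minimal sketch, not part of the patch: the model path reuses the FP8-Dynamic directory produced by the save step, while the task, few-shot count, and sample limit are illustrative assumptions.

```console
# Hypothetical example: evaluate the saved FP8 checkpoint with lm-eval's vLLM backend.
MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
lm_eval --model vllm \
  --model_args pretrained=$MODEL,add_bos_token=True \
  --tasks gsm8k --num_fewshot 5 --limit 250 \
  --batch_size auto
```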