Commit cdefd50

[doc] add install tips (vllm-project#17373)

Authored by reidliu41, committed by Mu Huai
1 parent 4a8338f

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>

5 files changed: +29 −10 lines changed

docs/source/features/quantization/fp8.md

+7 −7

@@ -44,6 +44,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves three main steps:
@@ -86,20 +92,14 @@ recipe = QuantizationModifier(
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
 
-# Save the model.
+# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR)
 tokenizer.save_pretrained(SAVE_DIR)
 ```
 
 ### 3. Evaluating Accuracy
 
-Install `vllm` and `lm-evaluation-harness`:
-
-```console
-pip install vllm lm-eval==0.4.4
-```
-
 Load and run the model in `vllm`:
 
 ```python
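The comment added above pins down the directory the save step writes. A minimal sketch of that derivation, assuming the `MODEL_ID` used in the fp8 guide (`meta-llama/Meta-Llama-3-8B-Instruct`; stated here as an assumption, not part of the diff):

```python
# Sketch: how SAVE_DIR in the doc snippet expands to the name in the new comment.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed from the fp8 guide

# split("/")[1] keeps only the repo name, dropping the org prefix.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
print(SAVE_DIR)  # -> Meta-Llama-3-8B-Instruct-FP8-Dynamic
```

The same pattern yields the directory names called out in the int4, int8, KV-cache, and Quark diffs below.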

docs/source/features/quantization/int4.md

+7 −1

@@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -87,7 +93,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
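Since the install tip now brings in `vllm` up front, a quick smoke test of the saved checkpoint can follow the save step directly; a sketch, assuming the local `SAVE_DIR` named in the new comment:

```python
# Sketch: reload the compressed W4A16 checkpoint in vLLM and generate once.
from vllm import LLM

llm = LLM(model="Meta-Llama-3-8B-Instruct-W4A16-G128")  # local SAVE_DIR
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```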

docs/source/features/quantization/int8.md

+7 −1

@@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](
 pip install llmcompressor
 ```
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 The quantization process involves four main steps:
@@ -91,7 +97,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save the compressed model
+# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
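The pinned `lm-eval==0.4.4` can also be driven from Python rather than the CLI; a sketch using its `simple_evaluate` entry point (the task choice and `limit` are illustrative, not from the diff):

```python
# Sketch: score the saved INT8 checkpoint on a small GSM8K subset.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",
    tasks=["gsm8k"],
    num_fewshot=5,
    limit=250,  # subset only; a sanity check, not a full eval
)
print(results["results"]["gsm8k"])
```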

docs/source/features/quantization/quantized_kvcache.md

+1 −1

@@ -126,7 +126,7 @@ oneshot(
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save quantized model
+# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
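For the KV-cache checkpoint, the saved directory is meant to be served with an FP8 KV cache; a sketch, assuming the `SAVE_DIR` named in the new comment:

```python
# Sketch: run the FP8-KV checkpoint with an FP8 KV cache in vLLM.
from vllm import LLM

llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
print(llm.generate("The capital of France is")[0].outputs[0].text)
```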

docs/source/features/quantization/quark.md

+7 −0

@@ -19,6 +19,12 @@ pip install amd-quark
 You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
 for more installation details.
 
+Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
+
+```console
+pip install vllm lm-eval==0.4.4
+```
+
 ## Quantization Process
 
 After installing Quark, we will use an example to illustrate how to use Quark.
@@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
 export_config = ExporterConfig(json_export_config=JsonExporterConfig())
 export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
 
+# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
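The Quark-exported checkpoint is likewise loadable back into vLLM for evaluation; a sketch, assuming the `EXPORT_DIR` named in the new comment and vLLM's Quark loader (`quantization="quark"` and the `tensor_parallel_size` value are assumptions for a 70B model, not part of the diff):

```python
# Sketch: load the Quark-exported FP8 checkpoint back into vLLM.
from vllm import LLM

llm = LLM(
    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
    kv_cache_dtype="fp8",
    quantization="quark",     # assumed: vLLM's Quark checkpoint loader
    tensor_parallel_size=4,   # illustrative; size to your GPUs
)
```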
