Skip to content

Update pre-commit hooks #12475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
Expand All @@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
Expand All @@ -35,7 +35,7 @@ repos:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
)

# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

Expand Down
8 changes: 6 additions & 2 deletions csrc/custom_all_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,13 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};

struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};

struct __align__(16) RankSignals { Signal* signals[8]; };
struct __align__(16) RankSignals {
Signal* signals[8];
};

// like std::array, but aligned
template <typename T, int sz>
Expand Down
8 changes: 4 additions & 4 deletions csrc/moe/marlin_kernels/marlin_moe_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
Comment on lines -141 to +142
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree personally, but will accept it 🤕

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it's as if the new version of clang-format is treating the template as comparison operators for the purpose of formatting 😕

// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down Expand Up @@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
Expand Down
16 changes: 8 additions & 8 deletions csrc/quantization/gptq_marlin/gptq_marlin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand All @@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {

// Guarantee that the `(a & b) | c` operations are LOP3s.

int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
Expand All @@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);

const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
Expand All @@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {

// Guarantee that the `(a & b) | c` operations are LOP3s.

int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);

typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static constexpr uint32_t HI = 0x00f000f0;
static constexpr uint32_t EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static constexpr uint32_t SUB = 0x64086408;
Expand Down
4 changes: 2 additions & 2 deletions csrc/quantization/marlin/sparse/common/mma.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
Expand Down
4 changes: 3 additions & 1 deletion csrc/rocm/attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions){UNREACHABLE_CODE}
const int max_num_partitions) {
UNREACHABLE_CODE
}

#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def get_rocm_version():

if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0):
return "%d.%d.%d" % (major.value, minor.value, patch.value)
return f"{major.value}.{minor.value}.{patch.value}"
return None
except Exception:
return None
Expand Down
25 changes: 14 additions & 11 deletions tests/kernels/test_block_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
]
B_tiles = [[
B[j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
B[
j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K),
] for i in range(k_tiles)
] for j in range(n_tiles)]
C_tiles = [
C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
Expand Down Expand Up @@ -157,9 +159,9 @@ def setup_cuda():
torch.set_default_device("cuda")


@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
SEEDS))
@pytest.mark.parametrize(
"num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
@torch.inference_mode()
def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
torch.manual_seed(seed)
Expand All @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
assert torch.allclose(scale, ref_scale)


@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
SEEDS))
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed)
Expand Down Expand Up @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001


@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
BLOCK_SIZE, DTYPES, SEEDS))
@pytest.mark.parametrize(
"M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
torch.manual_seed(seed)
Expand Down
10 changes: 5 additions & 5 deletions tests/kv_transfer/test_lookup_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0

print("My rank: %d, device: %s" % (my_rank, device))
print(f"My rank: {my_rank}, device: {device}")

# insert
tokens = torch.tensor([1, 2, 3]).to(device)
Expand Down Expand Up @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0

print("My rank: %d, Test run passed!" % (my_rank))
print(f"My rank: {my_rank}, Test run passed!")


def stress_test(my_rank, buf, device):
Expand Down Expand Up @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert torch.allclose(k, k_)
assert torch.allclose(v, v_)
assert torch.allclose(h, h_)
print('Rank %d done' % my_rank)
print(f"Rank {my_rank} done")
torch.distributed.barrier()

if my_rank == 0:
Expand All @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else:
torch.distributed.send(torch.tensor([n]), 0)

print("My rank: %d, Passed stress test!" % (my_rank))
print(f"My rank: {my_rank}, Passed stress test!")


if __name__ == "__main__":
Expand All @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device):
rank=my_rank,
)

print("initialized! My rank is %d" % my_rank)
print(f"initialized! My rank is {my_rank}")

config = KVTransferConfig(
kv_connector='PyNcclConnector',
Expand Down
6 changes: 3 additions & 3 deletions tests/lora/test_qwen2vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts


@pytest.mark.xfail(current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm"
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM(
MODEL_PATH,
Expand Down
Loading
Loading