Commit 6f4615b

Update

[ghstack-poisoned]

2 parents: 7debcd9 + fd2c6b7

157 files changed: +9776 −3883 lines


.github/workflows/build_wheels_linux.yml (+2)

```diff
@@ -43,6 +43,8 @@ jobs:
       # triggered daily from main with a schedule
       repository: pytorch/ao
       ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
       build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
       env-var-script: packaging/env_var_script_linux.sh
       pre-script: packaging/pre_build_script.sh
```

.github/workflows/float8_test.yml (+1 −3)

```diff
@@ -29,7 +29,7 @@ jobs:
           gpu-arch-type: "cuda"
           gpu-arch-version: "12.1"

-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
       runner: ${{ matrix.runs-on }}
@@ -38,8 +38,6 @@ jobs:
       script: |
         conda create -n venv python=3.9 -y
         conda activate venv
-        echo "::group::Install newer objcopy that supports --set-section-alignment"
-        yum install -y devtoolset-10-binutils
         export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
         python -m pip install --upgrade pip
         pip install ${{ matrix.torch-spec }}
```

.github/workflows/nightly_smoke_test.yml (+1 −1)

```diff
@@ -26,7 +26,7 @@ jobs:
           gpu-arch-version: "12.1"


-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: ${{ matrix.runs-on }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
```

.github/workflows/regression_test.yml (+32 −10)

```diff
@@ -18,6 +18,38 @@ env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}

 jobs:
+  test-nightly:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: CUDA Nightly
+            runs-on: linux.g5.12xlarge.nvidia.gpu
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.4"
+          - name: CPU Nightly
+            runs-on: linux.4xlarge
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
+            gpu-arch-type: "cpu"
+            gpu-arch-version: ""
+
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 120
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        python -m pip install --upgrade pip
+        pip install ${{ matrix.torch-spec }}
+        pip install -r dev-requirements.txt
+        pip install .
+        export CONDA=$(dirname $(dirname $(which conda)))
+        export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
+        pytest test --verbose -s
   test:
     strategy:
       fail-fast: false
@@ -38,11 +70,6 @@ jobs:
           torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121'
           gpu-arch-type: "cuda"
           gpu-arch-version: "12.1"
-        - name: CUDA Nightly
-          runs-on: linux.g5.12xlarge.nvidia.gpu
-          torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cu121'
-          gpu-arch-type: "cuda"
-          gpu-arch-version: "12.1"

         - name: CPU 2.3
           runs-on: linux.4xlarge
@@ -59,11 +86,6 @@ jobs:
           torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu'
           gpu-arch-type: "cpu"
           gpu-arch-version: ""
-        - name: CPU Nightly
-          runs-on: linux.4xlarge
-          torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cpu'
-          gpu-arch-type: "cpu"
-          gpu-arch-version: ""

     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
```
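Net effect: the nightly configurations move out of the pinned `test` matrix (which hard-coded `2.6.0.dev20241101` wheels) into a dedicated `test-nightly` job that always tracks the latest nightlies on `linux_job_v2`. A hypothetical smoke check, not part of this commit, for what that unpinned install should yield:

```python
# Hypothetical smoke check (not in this commit): verify that
#   pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124
# installed an actual nightly (.dev) build against CUDA 12.4.
import torch

print(torch.__version__)  # e.g. "2.6.0.dev20241101+cu124"
assert ".dev" in torch.__version__, "expected a nightly (.dev) build"
if torch.version.cuda is not None:  # None on CPU-only wheels
    assert torch.version.cuda.startswith("12.4"), torch.version.cuda
```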

.gitignore (+4 −1)

```diff
@@ -371,4 +371,7 @@ venv/
 sweep/

 # Model checkpoints
-checkpoints/
+checkpoints/
+
+# Experimental
+torchao/experimental/cmake-out
```

README.md (+4 −3)

````diff
@@ -177,8 +177,8 @@ We're also fortunate to be integrated into some of the leading open-source libra
 2. Hugging Face diffusers best practices with torch.compile and torchao in a standalone repo [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
 3. Mobius HQQ backend leveraged our int4 kernels to get [195 tok/s on a 4090](https://github.com/mobiusml/hqq#faster-inference)
 4. [TorchTune](https://github.com/pytorch/torchtune) for our QLoRA and QAT recipes
-5. [torchchat](https://github.com/pytorch/torchtune) for post training quantization
-6. [SGLang](https://github.com/sgl-project/sglang/pull/1341) for LLM inference quantization
+5. [torchchat](https://github.com/pytorch/torchchat) for post training quantization
+6. SGLang for LLM serving: [usage](https://github.com/sgl-project/sglang/blob/4f2ee48ed1c66ee0e189daa4120581de324ee814/docs/backend/backend.md?plain=1#L83) and the major [PR](https://github.com/sgl-project/sglang/pull/1341).

 ## Videos
 * [Keynote talk at GPU MODE IRL](https://youtu.be/FH5wiwOyPX4?si=VZK22hHz25GRzBG1&t=1009)
@@ -201,8 +201,9 @@ If you find the torchao library useful, please cite it in your work as below.
 @software{torchao,
   title = {torchao: PyTorch native quantization and sparsity for training and inference},
   author = {torchao maintainers and contributors},
-  url = {https//github.com/pytorch/torchao},
+  url = {https://github.com/pytorch/torchao},
   license = {BSD-3-Clause},
   month = oct,
   year = {2024}
+}
 ```
````

benchmarks/benchmark_low_bit_adam.py (+63 −25)
```diff
@@ -4,7 +4,7 @@
 # - lpmm (4-bit optim): pip install yacs git+https://github.com/thu-ml/low-bit-optimizers.git
 # - DeepSpeed (ZeRO-Offload):
 #   sudo apt install libopenmpi-dev
-#   LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu pip install mpi4p
+#   LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu pip install mpi4py
 #   DS_BUILD_CPU_ADAM=1 pip install deepspeed --no-cache-dir
 #
 # To fine-tune a pre-trained ViT-Base on resisc45 dataset with BF16 AMP, using default AdamW optimizer from PyTorch core
@@ -31,11 +31,15 @@
 import torch.nn.functional as F
 import wandb
 from torch.utils.data import DataLoader
+from torchao.utils import get_available_devices
 from torchvision.transforms import v2
 from tqdm import tqdm

 from torchao.prototype import low_bit_optim

+_DEVICE = get_available_devices()[-1]
+assert _DEVICE in ["cuda", "xpu"], "Benchmark currently only supports CUDA & XPU(BF16)"
+
 OPTIM_MAP = dict(
     AdamW=partial(torch.optim.AdamW, fused=True),
     AdamW8bitBnb=bnb.optim.AdamW8bit,
```
4953

5054
OPTIM_MAP.update(
5155
AdamW4bitLpmm=partial(lpmm.optim.AdamW, fused=True),
52-
AdamW4bitRank1Lpmm=partial(lpmm.optim.AdamW, qconfig=argparse.Namespace(scale_type="rank1")),
56+
AdamW4bitRank1Lpmm=partial(
57+
lpmm.optim.AdamW, qconfig=argparse.Namespace(scale_type="rank1")
58+
),
5359
)
5460

5561
except ImportError:
@@ -67,8 +73,12 @@ def get_lr(self, step: int) -> float:
6773
if step < self.warmup_steps:
6874
return self.lr * step / self.warmup_steps
6975
if step < self.total_steps:
70-
progress = (step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
71-
return self.final_lr + 0.5 * (self.lr - self.final_lr) * (1 + math.cos(progress * math.pi))
76+
progress = (step - self.warmup_steps) / (
77+
self.total_steps - self.warmup_steps
78+
)
79+
return self.final_lr + 0.5 * (self.lr - self.final_lr) * (
80+
1 + math.cos(progress * math.pi)
81+
)
7282
return self.final_lr
7383

7484

```diff
@@ -92,7 +102,9 @@ def get_parser():
     parser.add_argument("--weight_decay", type=float, default=0)
     parser.add_argument("--optim_kwargs", type=json.loads, default=dict())
     parser.add_argument("--cosine_lr_scheduler", action="store_true")
-    parser.add_argument("--optim_cpu_offload", choices=["ao", "ao_offload_grads", "deepspeed"])
+    parser.add_argument(
+        "--optim_cpu_offload", choices=["ao", "ao_offload_grads", "deepspeed"]
+    )

     parser.add_argument("--project")
     parser.add_argument("--run_name", default="debug")
@@ -110,11 +122,15 @@ def get_dloader(args, training: bool):
         transforms.extend([v2.Resize(256), v2.CenterCrop(224)])

     transforms.append(v2.ToDtype(torch.float32, scale=True))
-    transforms.append(v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
+    transforms.append(
+        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    )
     transforms = v2.Compose(transforms)

     # use dataset from HF so download is fast
-    ds = datasets.load_dataset("timm/resisc45", split="train" if training else "validation")
+    ds = datasets.load_dataset(
+        "timm/resisc45", split="train" if training else "validation"
+    )
     ds = ds.select_columns(["image", "label"])
     ds.set_transform(lambda x: dict(image=transforms(x["image"]), label=x["label"]))

@@ -128,9 +144,9 @@ def get_dloader(args, training: bool):
     )


-def get_amp_ctx(amp):
+def get_amp_ctx(amp, device):
     dtype = dict(bf16=torch.bfloat16, fp16=torch.float16, none=None)[amp]
-    return torch.autocast("cuda", dtype=dtype, enabled=amp != "none")
+    return torch.autocast(device, dtype=dtype, enabled=amp != "none")


 @torch.no_grad()
```
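The `get_amp_ctx` change carries the device generalization: `torch.autocast` accepts a device-type string ("cuda", "xpu", "cpu"), so one context manager serves every backend. A minimal self-contained sketch:

```python
# Device-generic autocast, mirroring the get_amp_ctx change above.
import torch


def get_amp_ctx(amp: str, device: str):
    # Map the benchmark's --amp flag to a dtype; "none" disables autocast.
    dtype = dict(bf16=torch.bfloat16, fp16=torch.float16, none=None)[amp]
    return torch.autocast(device, dtype=dtype, enabled=amp != "none")


# CPU works too, which makes for an easy sanity check:
with get_amp_ctx("bf16", "cpu"):
    y = torch.randn(8, 8) @ torch.randn(8, 8)
print(y.dtype)  # torch.bfloat16 inside an enabled bf16 autocast region
```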
```diff
@@ -148,8 +164,8 @@ def evaluate_model(model, args):
         if args.channels_last:
             batch["image"] = batch["image"].to(memory_format=torch.channels_last)

-        with get_amp_ctx(args.amp):
-            all_preds.append(model(batch["image"].cuda()).argmax(1).cpu())
+        with get_amp_ctx(args.amp, _DEVICE):
+            all_preds.append(model(batch["image"].to(_DEVICE)).argmax(1).cpu())

     all_labels = torch.cat(all_labels, dim=0)
     all_preds = torch.cat(all_preds, dim=0)
@@ -164,8 +180,12 @@ def evaluate_model(model, args):
     if args.full_bf16:
         assert args.amp == "none", "When --full_bf16 is set, --amp must be none"
     if args.optim_cpu_offload == "deepspeed":
-        assert args.amp == "none", "When using DeepSpeed ZeRO-Offload, --amp must be none"
-        assert args.optim == "AdamW", "When using DeepSpeed ZeRO-Offload, --optim must be AdamW"
+        assert (
+            args.amp == "none"
+        ), "When using DeepSpeed ZeRO-Offload, --amp must be none"
+        assert (
+            args.optim == "AdamW"
+        ), "When using DeepSpeed ZeRO-Offload, --optim must be AdamW"
     if args.profile:
         args.n_epochs = 1
     if args.seed is not None:
@@ -185,14 +205,16 @@ def evaluate_model(model, args):
     dloader = get_dloader(args, True)
     print(f"Train dataset: {len(dloader.dataset):,} images")

-    model = timm.create_model(args.model, pretrained=True, num_classes=45, **args.model_kwargs)
+    model = timm.create_model(
+        args.model, pretrained=True, num_classes=45, **args.model_kwargs
+    )
     if args.checkpoint_activations:
         model.set_grad_checkpointing()
     if args.full_bf16:
         model.bfloat16()
     if args.channels_last:
         model.to(memory_format=torch.channels_last)
-    model.cuda()  # move model to CUDA after optionally convert it to BF16
+    model.to(_DEVICE)  # move model to DEVICE after optionally convert it to BF16
     if args.compile:
         model.compile(fullgraph=True)
     print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
@@ -227,9 +249,15 @@ def evaluate_model(model, args):
     optim_cls = OPTIM_MAP[args.optim]

     if args.optim_cpu_offload == "ao":
-        optim_cls = partial(low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls)
+        optim_cls = partial(
+            low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls
+        )
     elif args.optim_cpu_offload == "ao_offload_grads":
-        optim_cls = partial(low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls, offload_gradients=True)
+        optim_cls = partial(
+            low_bit_optim.CPUOffloadOptimizer,
+            optimizer_class=optim_cls,
+            offload_gradients=True,
+        )

     optim = optim_cls(
         model.parameters(),
```
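For context, both `ao` modes wrap the base optimizer in torchao's `CPUOffloadOptimizer`, which keeps optimizer state on the CPU; `offload_gradients=True` additionally moves gradients there. A usage sketch (CUDA assumed, since the wrapper targets parameters living on an accelerator):

```python
# Usage sketch for CPUOffloadOptimizer, matching the two branches above.
import torch
from torchao.prototype import low_bit_optim

model = torch.nn.Linear(512, 512, device="cuda")
optim = low_bit_optim.CPUOffloadOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.AdamW,
    offload_gradients=True,  # the "ao_offload_grads" mode
    lr=3e-4,                 # remaining kwargs go to the wrapped AdamW
)

loss = model(torch.randn(8, 512, device="cuda")).sum()
loss.backward()
optim.step()
optim.zero_grad()
```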
```diff
@@ -239,24 +267,30 @@ def evaluate_model(model, args):
     )

     lr_schedule = CosineSchedule(args.lr, len(dloader) * args.n_epochs)
-    grad_scaler = torch.amp.GradScaler("cuda", enabled=args.amp == "fp16")
+    grad_scaler = torch.amp.GradScaler(_DEVICE, enabled=args.amp == "fp16")
     log_interval = 10
     t0 = time.perf_counter()

     step = 0
     for epoch_idx in range(args.n_epochs):
         model.train()
-        pbar = tqdm(dloader, dynamic_ncols=True, desc=f"Epoch {epoch_idx + 1}/{args.n_epochs}")
+        pbar = tqdm(
+            dloader, dynamic_ncols=True, desc=f"Epoch {epoch_idx + 1}/{args.n_epochs}"
+        )

         with torch.profiler.profile() if args.profile else nullcontext() as prof:
             for batch in pbar:
                 if args.full_bf16:
                     batch["image"] = batch["image"].bfloat16()
                 if args.channels_last:
-                    batch["image"] = batch["image"].to(memory_format=torch.channels_last)
+                    batch["image"] = batch["image"].to(
+                        memory_format=torch.channels_last
+                    )

-                with get_amp_ctx(args.amp):
-                    loss = F.cross_entropy(model(batch["image"].cuda()), batch["label"].cuda())
+                with get_amp_ctx(args.amp, _DEVICE):
+                    loss = F.cross_entropy(
+                        model(batch["image"].to(_DEVICE)), batch["label"].to(_DEVICE)
+                    )

                 if args.optim_cpu_offload == "deepspeed":
                     model.backward(loss)
@@ -275,7 +309,9 @@ def evaluate_model(model, args):
                 log_dict = dict(loss=loss.item(), lr=optim.param_groups[0]["lr"])
                 if step > 0:
                     t1 = time.perf_counter()
-                    log_dict["imgs_per_second"] = args.batch_size * log_interval / (t1 - t0)
+                    log_dict["imgs_per_second"] = (
+                        args.batch_size * log_interval / (t1 - t0)
+                    )
                     t0 = t1
                 logger.log(log_dict, step=step)

@@ -296,9 +332,11 @@ def evaluate_model(model, args):

     else:
         val_acc = evaluate_model(model, args)
-        print(f"Epoch {epoch_idx + 1}/{args.n_epochs}: val_acc={val_acc.item() * 100:.2f}")
+        print(
+            f"Epoch {epoch_idx + 1}/{args.n_epochs}: val_acc={val_acc.item() * 100:.2f}"
+        )
         logger.log(dict(val_acc=val_acc), step=step)

-    peak_mem = torch.cuda.max_memory_allocated() / 1e9
+    peak_mem = getattr(torch, _DEVICE).max_memory_allocated() / 1e9
     print(f"Max memory used: {peak_mem:.02f} GB")
     logger.log(dict(max_memory_allocated=peak_mem))
```
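The final hunk makes the memory report backend-generic: both `torch.cuda` and (in recent PyTorch) `torch.xpu` expose `max_memory_allocated()`, so `getattr(torch, _DEVICE)` dispatches to the matching backend module. A sketch under that assumption:

```python
# Backend-generic peak-memory reporting, as in the hunk above.
# Assumes _DEVICE is "cuda" or "xpu" and that the backend module
# exposes max_memory_allocated().
import torch

_DEVICE = "cuda"  # or "xpu"
x = torch.randn(1024, 1024, device=_DEVICE)

peak_mem = getattr(torch, _DEVICE).max_memory_allocated() / 1e9
print(f"Max memory used: {peak_mem:.02f} GB")
```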
