diff --git a/README.md b/README.md index bcf0e181d..2232fd445 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,24 @@ The install script installs system dependencies along with torchforge. Note that Optional: By default, the packages installation uses conda. If you want to install system packages on the target machine instead of conda, you can pass the `--use-sudo` flag to the installation script: `./scripts/install.sh --use-sudo`. +### XPU Installation + +XPU (Intel GPU) users can install with the dedicated script: + +```bash +conda create -n forge python=3.12 +conda activate forge +./scripts/install_xpu.sh +``` + +Notes: +- Requires Intel oneAPI toolkit installed at `$ONEAPI_ROOT`, `/opt/intel/oneapi`, or loadable via `module load intel/oneapi`. +- Python version must match `XPU_PYTHON_VERSION` in `assets/versions.sh`. +- XPU build installs Monarch with `USE_TENSOR_ENGINE=0`, so RDMA and distributed tensor features are disabled for now. +- Optional flag: `--use-sudo` (system packages via `apt`/`dnf` instead of conda). +- Re-activate your conda environment after install to pick up the oneAPI activation hook. + + ### Pixi Pixi combines benefits of uv with access to conda forge for system dependencies. [pixi.toml](./pixi.toml) provides a manifest with build tasks with `install` as a the combined install all task. 
diff --git a/assets/versions.sh b/assets/versions.sh index 120021f40..5a828d5e3 100644 --- a/assets/versions.sh +++ b/assets/versions.sh @@ -9,9 +9,14 @@ # Stable versions of upstream libraries for OSS repo PYTORCH_VERSION="2.9.0" -# ROCm builds vLLM from source (no prebuilt ROCm wheels available) +# ROCm/XPU builds vLLM from source (no prebuilt ROCm/XPU wheels available) VLLM_ROCM_VERSION="v0.10.0" +VLLM_XPU_VERSION="v0.17.0" +# PyTorch XPU version (vLLM v0.16+ dropped IPEX in favour of native XPU support) +PYTORCH_XPU_VERSION="2.10.0" +# vllm-xpu-kernels wheels only ship for Python 3.12 +XPU_PYTHON_VERSION="3.12" TORCHSTORE_BRANCH="no-monarch-2026.01.05" -# ROCm install builds these from source (no ROCm wheels); CUDA uses pyproject pins. +# ROCm/XPU builds these from source (no ROCm/XPU wheels); CUDA uses pyproject pins. TORCHTITAN_VERSION="v0.2.0" MONARCH_VERSION="v0.2.0" diff --git a/scripts/install_xpu.sh b/scripts/install_xpu.sh new file mode 100644 index 000000000..e7f13c914 --- /dev/null +++ b/scripts/install_xpu.sh @@ -0,0 +1,537 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +# Colors for output +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[0;33m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } +log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +VERSIONS_FILE="$REPO_ROOT/assets/versions.sh" +PYPROJECT_FILE="$REPO_ROOT/pyproject.toml" + +if [ ! 
-f "$VERSIONS_FILE" ]; then
+    log_error "Versions file not found: $VERSIONS_FILE"
+    exit 1
+fi
+
+source "$VERSIONS_FILE"
+
+# Validate required variables are set. XPU_PYTHON_VERSION is included because
+# check_python_version reads it and this script runs under `set -u`.
+for required_var in VLLM_XPU_VERSION TORCHSTORE_BRANCH TORCHTITAN_VERSION \
+    MONARCH_VERSION PYTORCH_XPU_VERSION XPU_PYTHON_VERSION; do
+    # ${!name:-} is bash indirect expansion: the value of the variable named by $name.
+    if [ -z "${!required_var:-}" ]; then
+        log_error "$required_var not set in $VERSIONS_FILE"
+        exit 1
+    fi
+done
+unset required_var
+
+# Defaults (override via environment variables)
+FORGE_DEPS_DIR="${FORGE_DEPS_DIR:-$HOME/.cache/torchforge}"
+
+# Check conda environment: exit 1 unless CONDA_DEFAULT_ENV is set and non-empty.
+check_conda_env() {
+    if [ -z "${CONDA_DEFAULT_ENV:-}" ]; then
+        log_error "Not running in a conda environment"
+        log_info "Please create and activate your conda environment first:"
+        # Use the pinned version so this hint cannot drift from check_python_version.
+        log_info " conda create -n forge python=${XPU_PYTHON_VERSION} -y"
+        log_info " conda activate forge"
+        exit 1
+    fi
+    log_info "Installing in conda environment: $CONDA_DEFAULT_ENV"
+}
+
+# Exit 1 unless the active interpreter's major.minor equals XPU_PYTHON_VERSION.
+check_python_version() {
+    local required="$XPU_PYTHON_VERSION"
+    local actual
+    actual=$(python -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+
+    if [ "$actual" != "$required" ]; then
+        log_error "Python ${actual} detected, but vllm-xpu-kernels requires Python ${required}"
+        log_info "Recreate your conda env with the correct version:"
+        log_info " conda create -n forge python=${required} -y"
+        exit 1
+    fi
+    log_info "Python version ${actual} matches XPU requirement"
+}
+
+# Check required command: exit 1 when "$1" is not resolvable via `command -v`.
+check_command() {
+    if ! command -v "$1" >/dev/null 2>&1; then
+        log_error "Required command '$1' not found"
+        exit 1
+    fi
+}
+
+# Report whether passwordless sudo is available. Informational only: this
+# function never exits, but install_system_packages aborts when --use-sudo was
+# requested and sudo is unavailable (there is no automatic fallback to conda).
+check_sudo() {
+    if sudo -n true 2>/dev/null; then
+        log_info "Passwordless sudo access detected."
+    else
+        log_warning "Passwordless sudo access is not available."
+        log_info "System package installation will fail; re-run without --use-sudo to install via conda."
+    fi
+}
+
+# Detect OS distribution from /etc/os-release.
+# Prints exactly one of: rhel_fedora, debian, unknown.
+# Returns 1 (printing nothing on stdout) when /etc/os-release is missing.
+detect_os_family() {
+    if [ ! -f /etc/os-release ]; then
+        # Callers capture stdout with $(...), so the error must go to stderr
+        # or it would be swallowed into the captured value.
+        log_error "/etc/os-release not found. Cannot determine OS distribution." >&2
+        return 1
+    fi
+
+    # Source the os-release file to get variables
+    . /etc/os-release
+
+    # Check ID_LIKE field for supported distributions
+    case "${ID_LIKE:-}" in
+        *"rhel"*|*"fedora"*)
+            echo "rhel_fedora"
+            ;;
+        *"debian"*)
+            echo "debian"
+            ;;
+        *)
+            # Fallback to ID if ID_LIKE is not set or doesn't match
+            case "${ID:-}" in
+                "rhel"|"fedora"|"centos"|"rocky"|"almalinux")
+                    echo "rhel_fedora"
+                    ;;
+                "debian"|"ubuntu")
+                    echo "debian"
+                    ;;
+                *)
+                    echo "unknown"
+                    ;;
+            esac
+            ;;
+    esac
+}
+
+# Install required system packages
+install_system_packages() {
+    local use_sudo=${1:-false}
+
+    log_info "Installing required system packages..."
+
+    # Two install paths: the system package manager (only when the caller passed
+    # "true") or conda-forge packages installed into the active environment.
+    if [ "$use_sudo" = "true" ]; then
+        # User explicitly requested sudo installation
+        # `sudo -n` is non-interactive: it fails instead of prompting for a password.
+        if sudo -n true 2>/dev/null; then
+            # Detect OS family using /etc/os-release
+            local os_family
+            os_family=$(detect_os_family)
+
+            case "$os_family" in
+                "rhel_fedora")
+                    log_info "Detected RHEL/Fedora-based OS - using system package manager"
+                    sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel \
+                        libunwind libunwind-devel clang protobuf-compiler
+                    ;;
+                "debian")
+                    log_info "Detected Debian-based OS - using system package manager"
+                    sudo apt-get update
+                    sudo apt-get install -y libibverbs1 rdma-core libmlx5-1 libibverbs-dev rdma-core-dev \
+                        libunwind-dev clang protobuf-compiler
+                    ;;
+                "unknown")
+                    log_error "Unsupported OS for automatic system package installation"
+                    log_info "Supported distributions: RHEL/Fedora-based (rhel fedora) and Debian-based (debian)"
+                    exit 1
+                    ;;
+            esac
+            log_info "System packages installed successfully via system package manager"
+        else
+            # No conda fallback on this path: --use-sudo without passwordless sudo is a hard error.
+            log_error "Sudo installation requested but no sudo access available"
+            log_info "Either run with sudo privileges or remove the --use-sudo flag to use conda"
+            exit 1
+        fi
+    else
+        # Default to conda installation
+        log_info "Installing system packages via conda (default method)"
+        conda install -c conda-forge rdma-core libibverbs-cos7-x86_64 libunwind clang libprotobuf -y
+        log_info "Conda package installation completed. Packages installed in conda environment."
+    fi
+}
+
+# Install a conda activation hook that sources Intel oneAPI, source it for the
+# current session, and verify that oneAPI is available afterwards.
+# Writes $CONDA_PREFIX/etc/conda/activate.d/xpu_env.sh.
+# Exits 1 when CONDA_PREFIX is unset/empty or when oneAPI cannot be found.
+setup_xpu_env() {
+    # ":-" keeps `set -u` from aborting with "unbound variable" before the
+    # explicit check below can print a readable error.
+    local conda_env_dir="${CONDA_PREFIX:-}"
+
+    if [ -z "$conda_env_dir" ]; then
+        log_error "Could not determine conda environment directory"
+        exit 1
+    fi
+
+    mkdir -p "${conda_env_dir}/etc/conda/activate.d"
+
+    # Quoted 'EOF' delimiter: the hook body is written verbatim, nothing is
+    # expanded at install time.
+    cat > "${conda_env_dir}/etc/conda/activate.d/xpu_env.sh" << 'EOF'
+# Source oneAPI if not already active
+if [ -z "${CMPLR_ROOT:-}" ] && [ -z "${MKLROOT:-}" ]; then
+    if [ -n "${ONEAPI_ROOT:-}" ] && [ -f "${ONEAPI_ROOT}/setvars.sh" ]; then
+        source "${ONEAPI_ROOT}/setvars.sh" --force 2>/dev/null || true
+    elif [ -f /opt/intel/oneapi/setvars.sh ]; then
+        source /opt/intel/oneapi/setvars.sh --force 2>/dev/null || true
+    fi
+fi
+EOF
+
+    # Source for current session
+    # Strict mode is switched off while sourcing and restored right after
+    # (presumably setvars.sh is not written for `set -eu` -- TODO confirm).
+    # shellcheck source=/dev/null
+    set +euo pipefail
+    source "${conda_env_dir}/etc/conda/activate.d/xpu_env.sh"
+    set -euo pipefail
+
+    # Validate oneAPI is now available
+    if [ -z "${CMPLR_ROOT:-}" ] && [ -z "${MKLROOT:-}" ]; then
+        # Check module system as fallback
+        if command -v module >/dev/null 2>&1 && module list 2>&1 | grep -qi "oneapi\|intel"; then
+            log_info "oneAPI loaded via module system"
+        else
+            log_error "Intel oneAPI not found after sourcing activation script"
+            log_info "Expected locations:"
+            log_info " \$ONEAPI_ROOT/setvars.sh"
+            log_info " /opt/intel/oneapi/setvars.sh"
+            log_info "Or load via: module load intel/oneapi"
+            exit 1
+        fi
+    else
+        log_info "oneAPI environment active (CMPLR_ROOT or MKLROOT set)"
+    fi
+
+    log_info "XPU conda activation hook installed"
+}
+
+# Make sure a git checkout of $1 exists at $2 and is checked out at ref $3.
+# Clones when $2/.git is missing, otherwise reuses the existing clone; always
+# fetches origin (with tags) and skips the checkout when the ref is empty.
+ensure_repo() {
+    local repo_url=$1
+    local dest=$2
+    local ref=$3
+
+    if [ ! -d "$dest/.git" ]; then
+        log_info "Cloning $repo_url into $dest"
+        git clone "$repo_url" "$dest"
+    else
+        log_info "Reusing existing repo at $dest"
+    fi
+
+    git -C "$dest" fetch origin --tags
+    if [ -n "$ref" ]; then
+        git -C "$dest" checkout "$ref"
+    fi
+}
+
+ensure_rust() {
+    if !
command -v rustup >/dev/null 2>&1; then + log_info "rustup not found; installing rustup" + check_command curl + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + fi + + if [ -f "$HOME/.cargo/env" ]; then + # shellcheck disable=SC1090 + source "$HOME/.cargo/env" + fi + + log_info "Ensuring Rust nightly toolchain" + rustup toolchain install nightly + rustup default nightly +} + +create_constraints_file() { + local torch_version + torch_version=$(python -c "import torch; print(torch.__version__)") + + local constraints_file="${FORGE_DEPS_DIR}/constraints.txt" + cat > "$constraints_file" <!~ \\[]", req, 1)[0].strip() + deps = [d for d in deps if name_of(d) not in skip] +elif dep_kind == "dev": + deps = data.get("project", {}).get("optional-dependencies", {}).get("dev", []) +else: + raise SystemExit(f"Unknown dep kind: {dep_kind}") + +if deps: + print("\n".join(deps)) +PY +); then + log_warning "Failed to parse pyproject.toml; installing tomli and retrying" + python -m pip install tomli + output=$(DEP_KIND="$dep_kind" PYPROJECT_FILE="$PYPROJECT_FILE" python - <<'PY' +import os +import re +from pathlib import Path + +import tomli as tomllib + +dep_kind = os.environ["DEP_KIND"] +pyproject_file = Path(os.environ["PYPROJECT_FILE"]) +data = tomllib.loads(pyproject_file.read_text()) + +deps = [] +if dep_kind == "base": + deps = data.get("project", {}).get("dependencies", []) + skip = { + "torch", + "vllm", + "torchstore", + "torchtitan", + "torchmonarch", + } + def name_of(req): + return re.split(r"[<=>!~ \\[]", req, 1)[0].strip() + deps = [d for d in deps if name_of(d) not in skip] +elif dep_kind == "dev": + deps = data.get("project", {}).get("optional-dependencies", {}).get("dev", []) +else: + raise SystemExit(f"Unknown dep kind: {dep_kind}") + +if deps: + print("\n".join(deps)) +PY +) + fi + + if [ -n "$output" ]; then + printf '%s\n' "$output" + fi +} + +install_forge() { + log_info "Installing Forge from source (no deps)" + python -m pip install 
-e "${REPO_ROOT}[dev]" --no-deps
+
+    log_info "Installing Forge dependencies from pyproject.toml"
+    # XPU avoids CUDA-only pins like torchmonarch-nightly by installing deps explicitly.
+    readarray -t base_deps < <(read_project_deps base)
+    if [ "${#base_deps[@]}" -gt 0 ]; then
+        python -m pip install "${base_deps[@]}"
+    fi
+
+    readarray -t dev_deps < <(read_project_deps dev)
+    if [ "${#dev_deps[@]}" -gt 0 ]; then
+        python -m pip install "${dev_deps[@]}"
+    fi
+}
+
+# Parse command line arguments.
+# Sets the global USE_SUDO ("true"/"false"). Prints usage and exits 0 on
+# -h/--help; exits 1 on any unrecognised option.
+parse_args() {
+    USE_SUDO=false
+
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --use-sudo)
+                USE_SUDO=true
+                shift
+                ;;
+            -h|--help)
+                echo "Usage: $0 [OPTIONS]"
+                echo ""
+                echo "Options:"
+                echo " --use-sudo Use system package manager instead of conda for system packages"
+                echo " -h, --help Show this help message"
+                echo ""
+                echo "By default, system packages are installed via conda for better isolation."
+                exit 0
+                ;;
+            *)
+                log_error "Unknown option: $1"
+                log_info "Use --help for usage information"
+                exit 1
+                ;;
+        esac
+    done
+}
+
+# Entry point: parse flags, run preflight checks, install system packages and
+# the oneAPI activation hook, build/install the Python stack, then smoke-test
+# the imports.
+main() {
+    # Parse command line arguments first
+    parse_args "$@"
+
+    echo "Forge XPU Installation"
+    echo "======================="
+    echo ""
+    echo "Note: Run this from the root of the torchforge repository"
+    if [ "$USE_SUDO" = "true" ]; then
+        echo "System packages will be installed via system package manager (requires sudo)"
+        check_sudo
+    else
+        echo "System packages will be installed via conda (default, safer)"
+    fi
+    echo ""
+
+    check_conda_env
+    # Tool checks run before check_python_version because that function invokes
+    # `python`; a missing interpreter should yield the check_command error.
+    check_command git
+    check_command python
+    check_command pip
+    check_command conda
+    check_python_version
+
+    mkdir -p "$FORGE_DEPS_DIR"
+
+    # Install build prerequisites
+    install_system_packages "$USE_SUDO"
+    setup_xpu_env
+
+    # vLLM installs PyTorch + triton-xpu, fixes triton conflict, creates constraints
+    install_vllm_xpu
+    verify_pytorch_xpu
+
+    # Everything below is protected by PIP_CONSTRAINT
+    install_torchstore
+    install_torchtitan
+    ensure_rust
+    install_monarch
+    install_forge
+
+    # Test installation
+    # torch and vllm imports are mandatory (a failure aborts under `set -e`);
+    # the remaining imports are best-effort and only reported on success.
+    log_info "Testing installation..."
+    python -c "import torch; print(f'PyTorch {torch.__version__} (XPU: {torch.xpu.is_available()})')"
+    python -c "import vllm; print('vLLM imported successfully')"
+
+    # Test other imports if possible
+    if python -c "import torchtitan" 2>/dev/null; then
+        echo "torchtitan imported successfully"
+    fi
+    if python -c "import monarch" 2>/dev/null; then
+        echo "monarch imported successfully"
+    fi
+    if python -c "import forge" 2>/dev/null; then
+        echo "forge imported successfully"
+    fi
+
+    echo ""
+    log_info "Installation completed successfully!"
+    echo ""
+    log_info "Re-activate the conda environment to make the changes take effect:"
+    log_info " conda deactivate && conda activate $CONDA_DEFAULT_ENV"
+}
+
+main "$@"
diff --git a/src/forge/actors/vllm/v1/generator.py b/src/forge/actors/vllm/v1/generator.py
index ea7e0326a..97a339694 100644
--- a/src/forge/actors/vllm/v1/generator.py
+++ b/src/forge/actors/vllm/v1/generator.py
@@ -242,6 +242,21 @@ async def setup(self, host_mesh, worker_registry, gpu_ids: list[str]):
             "forge.actors.vllm.v1.forge_executor.ForgeMonarchExecutor"
         )
 
+        # Disable vLLM's async scheduling for our custom executor backend.
+        # vLLM's __post_init__ is called twice: once at VllmConfig construction
+        # and again after EngineCore handshake (_perform_handshakes). In vLLM
+        # >= 0.14, async_scheduling defaults to None (auto-detect), which the
+        # first __post_init__ auto-enables to True since executor is still "mp".
+        # After we override the executor backend above, the second __post_init__
+        # sees async_scheduling=True with an unrecognized backend and raises
+        # ValueError. Setting False explicitly is safe for all vLLM versions:
+        # in <= 0.13 it was already the default, and our MonarchExecutor does
+        # not use vLLM's async scheduling mechanism.
+        if hasattr(self.vllm_config, "scheduler_config") and hasattr(
+            self.vllm_config.scheduler_config, "async_scheduling"
+        ):
+            self.vllm_config.scheduler_config.async_scheduling = False
+
         # Set up prefetching configuration via additional_config
         # There does not seem to be a real difference between pass by env var or via self.vllm_config
         if self.prefetch_weights_to_shm:
diff --git a/src/forge/actors/vllm/v1/monarch_executor.py b/src/forge/actors/vllm/v1/monarch_executor.py
index b42da2876..8c015e031 100644
--- a/src/forge/actors/vllm/v1/monarch_executor.py
+++ b/src/forge/actors/vllm/v1/monarch_executor.py
@@ -9,8 +9,10 @@
 from __future__ import annotations
 
 import base64
+import inspect
 import logging
 import os
+from functools import partial
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import cloudpickle
@@ -211,11 +213,24 @@ class WorkerWrapper(WorkerWrapperBase, Actor):
     stores).
     """
 
+    # Computed once when the class body runs: True if WorkerWrapperBase.__init__
+    # takes vllm_config (vLLM <= 0.13), False if only rpc_rank/global_rank (>= 0.14).
+    _wrapper_accepts_vllm_config: bool = (
+        "vllm_config" in inspect.signature(WorkerWrapperBase.__init__).parameters
+    )
+
     def __init__(self, vllm_config):
         rank = context().actor_instance.rank.rank
         # rpc_rank: rank within this executor (0 to num_workers-1)
         # global_rank: rank in distributed group (same as rpc_rank for single executor)
-        WorkerWrapperBase.__init__(self, vllm_config, rpc_rank=rank, global_rank=rank)
+        if self._wrapper_accepts_vllm_config:
+            # vLLM <= 0.13: vllm_config passed at wrapper init time
+            WorkerWrapperBase.__init__(
+                self, vllm_config, rpc_rank=rank, global_rank=rank
+            )
+        else:
+            # vLLM >= 0.14: vllm_config flows through init_worker(all_kwargs)
+            WorkerWrapperBase.__init__(self, rpc_rank=rank, global_rank=rank)
         Actor.__init__(self)
 
     def init_worker(self, all_kwargs):
@@ -234,9 +249,15 @@ def init_worker(self, all_kwargs):
         super().init_worker(all_kwargs)
 
     @endpoint
-    def execute_method(self, method: str, *args, **kwargs):
-        # For simplicity, we only support string method names for now
-        fn = getattr(self, method)
+    def execute_method(self, method, *args, **kwargs):
+        # `method` may be a method name (str), a cloudpickle'd callable (bytes, used
+        # by vLLM >= 0.17 collective_rpc lambdas) or a callable; callables get self first.
+        if isinstance(method, bytes):
+            fn = partial(cloudpickle.loads(method), self)  # NOTE: unpickling runs arbitrary code
+        elif isinstance(method, str):
+            fn = getattr(self, method)
+        else:
+            fn = partial(method, self)
         return fn(*args, **kwargs)
 
     @endpoint