From 816416e6dcc758891dfb658be9b0d674afba4c2d Mon Sep 17 00:00:00 2001 From: joelteply Date: Sat, 6 Jun 2026 19:40:57 -0500 Subject: [PATCH] =?UTF-8?q?continuum:=20explicit=20Metal=20device=20select?= =?UTF-8?q?ion=20=E2=80=94=20avoid=20hang=20in=20AMD=20Polaris=20lazy=20in?= =?UTF-8?q?it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ggml_metal_device_init` calls `MTLCreateSystemDefaultDevice()`, which on Intel Macs often returns the discrete GPU. On MacBookPro15,1 (Radeon Pro 560X, Polaris, macOS 15.7.7), the first `newCommandQueue` call enters `amdMtlBronzeLazyInit` → `amdMtlAllocateBuffer` → `IOAccelResourceCreate` → `IOConnectCallMethod` → `mach_msg2_trap` and never returns. Because GGML's Metal backend registers via a global static initializer (`ggml_backend_registry::ggml_backend_registry`), this hangs at process startup — every llama.cpp consumer on this hardware deadlocks before arg parsing. Captured via `sample(1)` of the live hung process: main → common_params_parser_init → ggml_backend_registry::ggml_backend_registry() → ggml_backend_metal_reg (ggml-metal.cpp:924) → ggml_metal_device_init (ggml-metal-device.m:641) → -[BronzeMtlDevice newCommandQueue] → -[BronzeMtlDevice amdMtlBronzeLazyInit] → IOAccelResourceCreate → mach_msg2_trap ← HANG Fix: enumerate via `MTLCopyAllDevices()` and select with priority: 1. Apple Silicon (always best) 2. Integrated low-power (Intel UHD/Iris on Intel Mac — substrate lowest-common-denominator (LCD) floor; hundreds of millions of Intel Macs ship with one) 3. External / eGPU 4. Discrete (last; may hang on Polaris-era AMD) Operators can override the selection via the `GGML_METAL_DEVICE_NAME=<name>` env var, where `<name>` is matched as a substring of the Metal device name. Logs all enumerated devices with `lowPower`/`removable`/`location` properties to give operators visibility into what was selected and why. 
Tested on MacBookPro15,1 / macOS 15.7.7: - Before: `--help` hung forever (sample captured the AMD mach_msg2_trap) - After: `--help` returns immediately; `llama-cli` loads Qwen 2.5 0.5B on Intel UHD 630, runs inference at 22.5 tok/s Known follow-up: Metal q4_k matmul kernel produces incorrect output on Intel UHD 630 (output: "antity@@@@@@@" instead of "Hello! How can..."), likely the matrix-vector fallback that runs when `has_simdgroup_mm=false`. Carded separately; not blocking the hang fix. Tested: macOS 15.7.7, Xcode 26.3 (macOS 26.2 SDK), MacBookPro15,1 (Coffee Lake + Intel UHD 630 + AMD Radeon Pro 560X). Card: 346b356f. --- ggml/src/ggml-metal/ggml-metal-device.m | 85 ++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index eabb2ca03496..62403f040665 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -635,7 +635,90 @@ ggml_metal_device_t ggml_metal_device_init(int device) { assert(dev != NULL); if (dev->mtl_device == nil) { - dev->mtl_device = MTLCreateSystemDefaultDevice(); + // Card 346b356f: don't blindly use MTLCreateSystemDefaultDevice(). + // On Intel Macs the system default is often the discrete GPU + // (e.g. Radeon Pro 560X / Vega 16/20), and on certain hardware + // (Polaris-class AMD on macOS 15.x) `newCommandQueue` enters + // amdMtlBronzeLazyInit → IOAccelResourceCreate → mach_msg2_trap + // and never returns — the ggml_metal_device_init hang task #131 + // captured on MacBookPro15,1 / Radeon Pro 560X / macOS 15.7.7. + // + // Strategy: enumerate via MTLCopyAllDevices() and prefer in order: + // 1. Apple Silicon (best perf; always works) + // 2. Integrated low-power GPU (Intel UHD/Iris on Intel Mac) — + // hundreds of millions of Intel Macs ship with one; LCD floor + // 3. External / eGPU (rare but valid) + // 4. 
Discrete (last; known to hang on some Polaris-era AMD) + // Env var GGML_METAL_DEVICE_NAME=<name> overrides for ops who + // want to force a specific device by name match. + const char * device_name_hint = getenv("GGML_METAL_DEVICE_NAME"); +#if TARGET_OS_OSX + NSArray<id<MTLDevice>> * all_devices = MTLCopyAllDevices(); +#else + NSArray<id<MTLDevice>> * all_devices = @[ MTLCreateSystemDefaultDevice() ]; +#endif + if ([all_devices count] == 0) { + GGML_LOG_ERROR("%s: error: no Metal devices available\n", __func__); + } else { + GGML_LOG_INFO("%s: enumerated %lu Metal device(s)\n", __func__, (unsigned long) [all_devices count]); + for (id<MTLDevice> d in all_devices) { + GGML_LOG_INFO("%s: - %s (lowPower=%d, removable=%d, location=%ld)\n", + __func__, [[d name] UTF8String], + (int) d.lowPower, (int) d.removable, (long) d.location); + } + // 0) Operator override + if (device_name_hint && device_name_hint[0] != '\0') { + NSString * hint = [NSString stringWithUTF8String:device_name_hint]; + for (id<MTLDevice> d in all_devices) { + if ([[d name] containsString:hint]) { + dev->mtl_device = [d retain]; + GGML_LOG_INFO("%s: selected via GGML_METAL_DEVICE_NAME hint: %s\n", + __func__, [[d name] UTF8String]); + break; + } + } + } + // 1) Apple Silicon + if (dev->mtl_device == nil) { + for (id<MTLDevice> d in all_devices) { + if ([d supportsFamily:MTLGPUFamilyApple1]) { + dev->mtl_device = [d retain]; + GGML_LOG_INFO("%s: selected Apple Silicon device: %s\n", + __func__, [[d name] UTF8String]); + break; + } + } + } + // 2) Integrated low-power (Intel UHD/Iris on Intel Mac — substrate LCD floor) + if (dev->mtl_device == nil) { + for (id<MTLDevice> d in all_devices) { + if (d.lowPower) { + dev->mtl_device = [d retain]; + GGML_LOG_INFO("%s: selected integrated low-power device: %s\n", + __func__, [[d name] UTF8String]); + break; + } + } + } + // 3) External / eGPU + if (dev->mtl_device == nil) { + for (id<MTLDevice> d in all_devices) { + if (d.location == MTLDeviceLocationExternal) { + dev->mtl_device = [d retain]; + GGML_LOG_INFO("%s: selected external device: %s\n", 
+ __func__, [[d name] UTF8String]); + break; + } + } + } + // 4) Discrete (last resort — may hang on Polaris-era AMD, see card 346b356f) + if (dev->mtl_device == nil) { + dev->mtl_device = [[all_devices objectAtIndex:0] retain]; + GGML_LOG_WARN("%s: falling back to first available device (may be discrete AMD): %s\n", + __func__, [[dev->mtl_device name] UTF8String]); + } + } + [all_devices release]; if (dev->mtl_device) { dev->mtl_queue = [dev->mtl_device newCommandQueue];