#include "present/nvapi_d3d12.h"
// NOTE(review): the header names of the eleven directives below are missing
// from this copy of the file (angle-bracket contents were lost) — restore them
// from version control; they are left exactly as found.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using Microsoft::WRL::ComPtr;

namespace sauna {

namespace {

constexpr NvU32 kVendorId = 0x2709;  // EDID "BIG"
constexpr int kNumBuffers = 2;

// Per-process GPU memory in MB (diagnostic for the M5 release/reacquire VRAM
// leak). Queries the LOCAL segment group of the hardware adapter with the most
// dedicated VRAM; that adapter is looked up once and cached in a function-local
// static. Returns 0 if unavailable (no factory, no hardware adapter, or the
// query fails) — a failed lookup is retried on the next call.
//
// NOTE(review): interface versions are the minimum the visible calls require
// (EnumAdapters1 -> IDXGIFactory1/IDXGIAdapter1, QueryVideoMemoryInfo ->
// IDXGIAdapter3); confirm against the original declarations.
static uint64_t vramMB() {
  static Microsoft::WRL::ComPtr<IDXGIAdapter3> a;
  if (!a) {
    Microsoft::WRL::ComPtr<IDXGIFactory1> f;
    if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&f)))) return 0;
    // Adapter with the most dedicated VRAM = the dGPU (DirectMode runs there).
    // EnumAdapters1(0) can be the iGPU on a hybrid box with a motherboard
    // display, whose LOCAL segment reads ~0 — blanking the counter.
    Microsoft::WRL::ComPtr<IDXGIAdapter1> best, a1;
    SIZE_T bestVram = 0;
    // Stop on ANY failure, not only DXGI_ERROR_NOT_FOUND: another error code
    // leaves a1 null, and the old `!= NOT_FOUND` test would dereference it and
    // never terminate.
    for (UINT i = 0; SUCCEEDED(f->EnumAdapters1(i, &a1)) && a1; i++) {
      DXGI_ADAPTER_DESC1 d{};
      if (SUCCEEDED(a1->GetDesc1(&d)) &&
          !(d.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) &&
          d.DedicatedVideoMemory > bestVram) {
        bestVram = d.DedicatedVideoMemory;
        best = a1;
      }
      a1.Reset();
    }
    if (!best) return 0;
    if (FAILED(best.As(&a))) return 0;  // no IDXGIAdapter3 on this adapter
  }
  DXGI_QUERY_VIDEO_MEMORY_INFO info{};
  if (a && SUCCEEDED(a->QueryVideoMemoryInfo(
               0, DXGI_MEMORY_SEGMENT_GROUP_LOCAL, &info)))
    return info.CurrentUsage / (1024 * 1024);
  return 0;
}

// Evaluate an NVAPI call; on any status other than NVAPI_OK, log the call text,
// status and line to stderr and `return false` from the enclosing function.
#define NVFAIL(call)                                                  \
  do {                                                                \
    NvAPI_Status s_ = (call);                                         \
    if (s_ != NVAPI_OK) {                                             \
      fprintf(stderr, "Presenter: %s = 0x%x (line %d)\n", #call, s_,  \
              __LINE__);                                              \
      return false;                                                   \
    }                                                                 \
  } while (0)

// Same contract as NVFAIL for HRESULT-returning calls (fails on FAILED(hr)).
#define HRFAIL(call)                                                  \
  do {                                                                \
    HRESULT h_ = (call);                                              \
    if (FAILED(h_)) {                                                 \
      fprintf(stderr, "Presenter: %s = 0x%lx (line %d)\n", #call, h_, \
              __LINE__);                                              \
      return false;                                                   \
    }                                                                 \
  } while (0)

// Pin the driver's power management for this exe to "prefer maximum
// performance" via a DRS application profile (what SteamVR/games do).
// M3 field finding: episodes of 8-20x render slowdown with NO fan ramp — // the P-state heuristic sees our ~4 ms/frame burst load as idle, drops // to idle clocks, the frame balloons to 100-200 ms (reads as rubber-band // tracking), then recovers. Best effort: failure only logs — the user // can set the same thing in the NVIDIA Control Panel. void ensureMaxPerfProfile() { NvDRSSessionHandle session{}; if (NvAPI_DRS_CreateSession(&session) != NVAPI_OK) return; bool saved = false; do { if (NvAPI_DRS_LoadSettings(session) != NVAPI_OK) break; const wchar_t exeName[] = L"spatial_light.exe"; NvDRSProfileHandle prof{}; NVDRS_APPLICATION app{}; app.version = NVDRS_APPLICATION_VER; if (NvAPI_DRS_FindApplicationByName(session, (NvU16*)exeName, &prof, &app) != NVAPI_OK) { NVDRS_PROFILE pr{}; pr.version = NVDRS_PROFILE_VER; wcscpy_s((wchar_t*)pr.profileName, NVAPI_UNICODE_STRING_MAX, L"sauna"); if (NvAPI_DRS_CreateProfile(session, &pr, &prof) != NVAPI_OK) break; NVDRS_APPLICATION na{}; na.version = NVDRS_APPLICATION_VER; wcscpy_s((wchar_t*)na.appName, NVAPI_UNICODE_STRING_MAX, (const wchar_t*)exeName); if (NvAPI_DRS_CreateApplication(session, prof, &na) != NVAPI_OK) break; } NVDRS_SETTING cur{}; cur.version = NVDRS_SETTING_VER; if (NvAPI_DRS_GetSetting(session, prof, PREFERRED_PSTATE_ID, &cur) == NVAPI_OK && cur.u32CurrentValue == PREFERRED_PSTATE_PREFER_MAX) { saved = true; // already pinned — don't churn the driver store break; } NVDRS_SETTING set{}; set.version = NVDRS_SETTING_VER; set.settingId = PREFERRED_PSTATE_ID; set.settingType = NVDRS_DWORD_TYPE; set.u32CurrentValue = PREFERRED_PSTATE_PREFER_MAX; if (NvAPI_DRS_SetSetting(session, prof, &set) != NVAPI_OK) break; if (NvAPI_DRS_SaveSettings(session) != NVAPI_OK) break; printf("driver profile: power management pinned to max performance " "(new — takes full effect from this or the next launch)\n"); saved = true; } while (false); if (!saved) fprintf(stderr, "driver profile: could not pin max performance — if tracking " 
"rubber-bands, set Power management mode = Prefer maximum " "performance for spatial_light.exe in NVIDIA Control Panel\n"); NvAPI_DRS_DestroySession(session); } // Current graphics-domain core clock, MHz (0 = unavailable). 1 Hz status // poll — cost is negligible. First physical GPU; v1 is single-GPU NVIDIA. int currentGpuClockMHz() { static NvPhysicalGpuHandle gpu{}; static bool tried = false; if (!tried) { tried = true; NvPhysicalGpuHandle h[NVAPI_MAX_PHYSICAL_GPUS]{}; NvU32 n = 0; if (NvAPI_EnumPhysicalGPUs(h, &n) == NVAPI_OK && n > 0) gpu = h[0]; } if (!gpu) return 0; NV_GPU_CLOCK_FREQUENCIES f{}; f.version = NV_GPU_CLOCK_FREQUENCIES_VER; f.ClockType = NV_GPU_CLOCK_FREQUENCIES_CURRENT_FREQ; if (NvAPI_GPU_GetAllClockFrequencies(gpu, &f) != NVAPI_OK) return 0; const auto& d = f.domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS]; return d.bIsPresent ? (int)(d.frequency / 1000) : 0; // kHz -> MHz } } // namespace struct NvapiD3d12Presenter::Impl { NV_DIRECT_MODE_DISPLAY_HANDLE display{}; NV_DIRECT_MODE_INFO mode{}; double modeHz = 75.0; ComPtr dev; ComPtr queue; ComPtr alloc; ComPtr list; ComPtr fence; HANDLE fenceEvt = nullptr; UINT64 fenceVal = 0; NV_DIRECT_MODE_SURFACE_HANDLE surf[kNumBuffers]{}; ComPtr res[kNumBuffers]; ComPtr rtvHeap; D3D12_CPU_DESCRIPTOR_HANDLE rtv[kNumBuffers]{}; DXGI_FORMAT rtvFormat = DXGI_FORMAT_UNKNOWN; UINT width = 0, height = 0; HANDLE waitEvt = nullptr; bool haveWaitable = false; bool acquired = false; mutable std::mutex statsMu; PresentStats stats; ~Impl() { if (acquired) { NvAPI_D3D_ReleaseDirectModeDisplay(kVendorId, &display); acquired = false; } if (fenceEvt) CloseHandle(fenceEvt); // NvAPI_Unload deliberately skipped (S2): unloading before D3D teardown // crashes in driver DLL atexit order. 
} }; NvapiD3d12Presenter::NvapiD3d12Presenter(const NvapiPresenterConfig& cfg) : impl_(new Impl), cfg_(cfg) {} NvapiD3d12Presenter::~NvapiD3d12Presenter() = default; bool NvapiD3d12Presenter::init() { Impl& im = *impl_; NVFAIL(NvAPI_Initialize()); // Driver version up front — fleet machines fail init in driver-dependent // ways (struct-version mismatches, DirectMode quirks); the bug report // should carry this without asking. { NvU32 ver = 0; NvAPI_ShortString branch{}; if (NvAPI_SYS_GetDriverAndBranchVersion(&ver, branch) == NVAPI_OK) printf("NVIDIA driver %u.%02u (%s)\n", ver / 100, ver % 100, branch); } ensureMaxPerfProfile(); NvAPI_Status st = NvAPI_DISP_EnableDirectMode(kVendorId, 0); printf("EnableDirectMode(0x%x) = 0x%x\n", kVendorId, st); NvU32 numDisplays = 0; NvAPI_DISP_EnumerateDirectModeDisplays(kVendorId, &numDisplays, nullptr, NV_ENUM_DIRECTMODE_DISPLAY_ENABLED); if (numDisplays == 0) { fprintf(stderr, "Presenter: no DirectMode display (Beyond not attached?)\n"); return false; } std::vector displays(numDisplays); NVFAIL(NvAPI_DISP_EnumerateDirectModeDisplays( kVendorId, &numDisplays, displays.data(), NV_ENUM_DIRECTMODE_DISPLAY_ENABLED)); im.display = displays[0]; printf("using displayId 0x%x\n", im.display.displayId); // Mode chosen by VALUE — the list reorders between runs (S2 bug #3a). NvU32 modeCount = 0; NVFAIL(NvAPI_D3D_DirectModeGetDisplayModes( &im.display, &modeCount, nullptr, NV_DIRECTMODE_GETMODES_FLAG_SUPPORTED)); std::vector modes(modeCount); for (auto& m : modes) { memset(&m, 0, sizeof(m)); m.version = NV_DIRECT_MODE_INFO_VER; } NVFAIL(NvAPI_D3D_DirectModeGetDisplayModes( &im.display, &modeCount, modes.data(), NV_DIRECTMODE_GETMODES_FLAG_SUPPORTED)); int chosen = -1; for (NvU32 i = 0; i < modeCount; i++) { auto& m = modes[i]; double hz = m.refresh.denominator ? 
(double)m.refresh.numerator / m.refresh.denominator : 0; printf("mode[%u]: %ux%u @ %.3f Hz format=%d\n", i, m.width, m.height, hz, (int)m.format); if (m.width == cfg_.width && m.height == cfg_.height && (int)m.format == cfg_.nvFormat && fabs(hz - cfg_.refreshHz) < 0.5) { chosen = (int)i; im.modeHz = hz; } } if (chosen < 0) { // Requested mode absent. The Beyond's firmware rate toggle (75/90) // restricts the offered list to ONE rate's mode set — field // (Win10/4090 rig): headset pinned at 90 Hz offered only // 3840x1920@90 while we asked the 75 Hz native default, and the old // refusal here read as a broken install. The link FORMAT is the part // that was validated by value (S2); the same format at the rate the // headset is actually offering is not a guess — adopt it loudly. for (NvU32 i = 0; i < modeCount; i++) { auto& m = modes[i]; if ((int)m.format != cfg_.nvFormat) continue; const double hz = m.refresh.denominator ? (double)m.refresh.numerator / m.refresh.denominator : 0; if (hz <= 0) continue; if (chosen < 0 || hz > im.modeHz) { chosen = (int)i; im.modeHz = hz; } } if (chosen >= 0) printf("Presenter: requested %ux%u@%.0f not offered (headset rate " "toggle set to the other refresh?) — using %ux%u@%.3f, same " "link format %d\n", cfg_.width, cfg_.height, cfg_.refreshHz, modes[chosen].width, modes[chosen].height, im.modeHz, cfg_.nvFormat); } if (chosen < 0) { fprintf(stderr, "Presenter: no mode matches %ux%u@%.0f format %d — refusing to " "guess (mode list above)\n", cfg_.width, cfg_.height, cfg_.refreshHz, cfg_.nvFormat); return false; } im.mode = modes[chosen]; printf("using mode %ux%u @ %.3f Hz format=%d\n", im.mode.width, im.mode.height, im.modeHz, (int)im.mode.format); if (cfg_.forceDsc) { // Default DSC negotiation garbles the link (S2 bug #2) — force the // device JSON's own parameters. 
im.mode.dscParams.dscMode = NV_DIRECT_MODE_DSC_MODE_FORCE_ENABLED; im.mode.dscParams.dscVersion = NV_DIRECT_MODE_DSC_VERSION_V11; im.mode.dscParams.sliceCount = NV_DIRECT_MODE_DSC_SLICE_COUNT_4; im.mode.dscParams.outputBPPx16 = 128; printf("forcing DSC 1.1, 4 slices, 8 bpp\n"); } NvPhysicalGpuHandle gpu; NVFAIL(NvAPI_SYS_GetPhysicalGpuFromDisplayId(im.display.displayId, &gpu)); ComPtr adapter; NVFAIL(NvAPI_D3D_GetIDXGIAdapter(gpu, adapter.GetAddressOf())); HRFAIL(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&im.dev))); D3D12_COMMAND_QUEUE_DESC qd{}; qd.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; // HIGH queue priority (no privileges needed, unlike GLOBAL_REALTIME): // the compositor must win GPU arbitration against the desktop apps it // is mirroring — M3 field finding: bursts of desktop GPU load stalled // presents to 100-200 ms and read as rubber-band tracking. qd.Priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH; HRESULT qhr = im.dev->CreateCommandQueue(&qd, IID_PPV_ARGS(&im.queue)); if (FAILED(qhr)) { // fall back rather than die on exotic drivers qd.Priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL; HRFAIL(im.dev->CreateCommandQueue(&qd, IID_PPV_ARGS(&im.queue))); fprintf(stderr, "Presenter: HIGH queue priority unavailable, using " "NORMAL\n"); } HRFAIL(im.dev->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&im.alloc))); HRFAIL(im.dev->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, im.alloc.Get(), nullptr, IID_PPV_ARGS(&im.list))); HRFAIL(im.list->Close()); HRFAIL(im.dev->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&im.fence))); im.fenceEvt = CreateEventA(nullptr, FALSE, FALSE, nullptr); return acquireDisplay(); } bool NvapiD3d12Presenter::acquireDisplay() { Impl& im = *impl_; // Acquire can transiently fail right after another process released the // display (driver settle) — retry with backoff. 
NvAPI_Status ast = NVAPI_ERROR; for (int attempt = 0; attempt < 10; attempt++) { ast = NvAPI_D3D_AcquireDirectModeDisplay(kVendorId, im.dev.Get(), &im.display); printf("AcquireDirectModeDisplay = 0x%x%s\n", ast, ast == NVAPI_HDCP_DISABLED ? " (HDCP disabled, continuing)" : ""); if (ast == NVAPI_OK || ast == NVAPI_HDCP_DISABLED) break; Sleep(1000); } if (ast != NVAPI_OK && ast != NVAPI_HDCP_DISABLED) return false; im.acquired = true; printf("[VRAM] acquire: after AcquireDisplay %llu MB\n", (unsigned long long)vramMB()); HANDLE shared[kNumBuffers]; for (int i = 0; i < kNumBuffers; i++) { NVFAIL(NvAPI_D3D_DirectModeCreateSurface(&im.display, &im.mode, &im.surf[i], &shared[i])); HRFAIL(im.dev->OpenSharedHandle(shared[i], IID_PPV_ARGS(&im.res[i]))); // CloseHandle the shared NT handle once OpenSharedHandle has taken its own // reference. NOT closing it leaks ~one scanout surface set per acquire: // the dangling handle kept the surface allocation alive, so releaseDisplay's // DestroySurface + res.Reset() could not return the VRAM — every SteamVR // release/reacquire grew the process by ~150 MB (M5 VRAM leak). CloseHandle(shared[i]); } auto rdesc = im.res[0]->GetDesc(); im.width = (UINT)rdesc.Width; im.height = rdesc.Height; printf("surface: %ux%u format=%d\n", im.width, im.height, (int)rdesc.Format); printf("[VRAM] acquire: after CreateSurface x%d %llu MB\n", kNumBuffers, (unsigned long long)vramMB()); // DM surfaces come back TYPELESS with a VARYING format across runs (S2 // bug #1/#3b) — explicit typed RTV, fail loudly on an unmapped format. 
DXGI_FORMAT rtvFmt; switch (rdesc.Format) { case DXGI_FORMAT_B8G8R8A8_TYPELESS: rtvFmt = DXGI_FORMAT_B8G8R8A8_UNORM; break; case DXGI_FORMAT_B8G8R8X8_TYPELESS: rtvFmt = DXGI_FORMAT_B8G8R8X8_UNORM; break; case DXGI_FORMAT_R8G8B8A8_TYPELESS: rtvFmt = DXGI_FORMAT_R8G8B8A8_UNORM; break; case DXGI_FORMAT_R10G10B10A2_TYPELESS: rtvFmt = DXGI_FORMAT_R10G10B10A2_UNORM; break; case DXGI_FORMAT_R16G16B16A16_TYPELESS: rtvFmt = DXGI_FORMAT_R16G16B16A16_FLOAT; break; default: fprintf(stderr, "Presenter: surface format %d has no typed RTV mapping — " "refusing (would scan out uninitialized VRAM)\n", (int)rdesc.Format); return false; } printf("rtv format: %d\n", (int)rtvFmt); im.rtvFormat = rtvFmt; D3D12_DESCRIPTOR_HEAP_DESC hd{}; hd.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV; hd.NumDescriptors = kNumBuffers; HRFAIL(im.dev->CreateDescriptorHeap(&hd, IID_PPV_ARGS(&im.rtvHeap))); UINT step = im.dev->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV); for (int i = 0; i < kNumBuffers; i++) { im.rtv[i] = im.rtvHeap->GetCPUDescriptorHandleForHeapStart(); im.rtv[i].ptr += (SIZE_T)i * step; D3D12_RENDER_TARGET_VIEW_DESC rd{}; rd.Format = rtvFmt; rd.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; im.dev->CreateRenderTargetView(im.res[i].Get(), &rd, im.rtv[i]); } NvAPI_Status st = NvAPI_D3D_DirectModeSetDisplayMode(&im.display, &im.mode); if (st == NVAPI_HDCP_DISABLED) st = NVAPI_OK; printf("SetDisplayMode = 0x%x\n", st); if (st != NVAPI_OK) return false; st = NvAPI_DISP_DirectModeDisplayControl(&im.display, NV_DM_DISPLAY_CONTROL_POWER_ON); printf("DisplayControl(POWER_ON) = 0x%x\n", st); st = NvAPI_DISP_DirectModeGetPresentWaitableObject( &im.display, reinterpret_cast(&im.waitEvt)); im.haveWaitable = (st == NVAPI_OK); printf("GetPresentWaitableObject = 0x%x\n", st); printf("[VRAM] acquire: DONE (modeset+POWER_ON) %llu MB\n", (unsigned long long)vramMB()); return true; } void NvapiD3d12Presenter::releaseDisplay() { Impl& im = *impl_; if (!im.acquired) return; printf("[VRAM] release: 
entry %llu MB\n", (unsigned long long)vramMB()); // Drain our GPU work before destroying scanout surfaces. im.queue->Signal(im.fence.Get(), ++im.fenceVal); if (im.fence->GetCompletedValue() < im.fenceVal) { im.fence->SetEventOnCompletion(im.fenceVal, im.fenceEvt); WaitForSingleObject(im.fenceEvt, 2000); } for (int i = 0; i < kNumBuffers; i++) { im.res[i].Reset(); if (im.surf[i]) { NvAPI_D3D_DirectModeDestroySurface(&im.display, im.surf[i]); im.surf[i] = {}; } } printf("[VRAM] release: after Reset+DestroySurf %llu MB\n", (unsigned long long)vramMB()); im.rtvHeap.Reset(); // The present waitable is a driver-owned notify handle — not ours to // close; the next acquire fetches a fresh one. im.waitEvt = nullptr; im.haveWaitable = false; NvAPI_Status st = NvAPI_D3D_ReleaseDirectModeDisplay(kVendorId, &im.display); printf("ReleaseDirectModeDisplay = 0x%x\n", st); printf("[VRAM] release: DONE (ReleaseDisplay) %llu MB\n", (unsigned long long)vramMB()); im.acquired = false; } // Single-buffer doze helpers (M5/ADR-0005). createScanoutBuffer rebuilds one // scanout surface + its RTV; freeScanoutBuffer drains the queue and drops it. // Both assume im.display is acquired, the rtvHeap exists, im.rtvFormat is set, // and im.rtv[i] already points at its descriptor slot (all true after the // first acquireDisplay) — so wake reuses the existing descriptor, just // rebinding the view to the freshly created resource. 
bool NvapiD3d12Presenter::createScanoutBuffer(int i) { Impl& im = *impl_; HANDLE shared = nullptr; NVFAIL(NvAPI_D3D_DirectModeCreateSurface(&im.display, &im.mode, &im.surf[i], &shared)); HRFAIL(im.dev->OpenSharedHandle(shared, IID_PPV_ARGS(&im.res[i]))); CloseHandle(shared); // see acquireDisplay: not closing leaks the surface set D3D12_RENDER_TARGET_VIEW_DESC rd{}; rd.Format = im.rtvFormat; rd.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; im.dev->CreateRenderTargetView(im.res[i].Get(), &rd, im.rtv[i]); return true; } void NvapiD3d12Presenter::freeScanoutBuffer(int i) { Impl& im = *impl_; // Drain: nothing in flight may still reference res[i] when we drop it. im.queue->Signal(im.fence.Get(), ++im.fenceVal); if (im.fence->GetCompletedValue() < im.fenceVal) { im.fence->SetEventOnCompletion(im.fenceVal, im.fenceEvt); WaitForSingleObject(im.fenceEvt, 2000); } im.res[i].Reset(); if (im.surf[i]) { NvAPI_D3D_DirectModeDestroySurface(&im.display, im.surf[i]); im.surf[i] = {}; } } void NvapiD3d12Presenter::run(double seconds, const DrawFn& draw) { Impl& im = *impl_; const double period = 1.0 / im.modeHz; // Pacing QoS (SCELTOUIN hitch triage): Win11 laptops power-throttle // (EcoQoS) and deprioritize threads of unfocused console apps — the // suspected cause of periodic 27-71 ms loop preemptions (frame= spiked // while gpu= and vsw= stayed small). Raise this thread and opt it out // of throttling; process-wide measures (timer resolution, process // EcoQoS opt-out) are the app's job. Both calls are harmless where the // problem never existed. 
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); THREAD_POWER_THROTTLING_STATE tps{}; tps.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; tps.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; tps.StateMask = 0; // throttling OFF for this thread SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &tps, sizeof(tps)); // Free-run detection (M1 doff hazard): per-frame WORK time — waitable wait // included, pacing sleep excluded. vsync alive => work ~= period; panels // dead => waitable stops blocking and work collapses to near zero. double workEma = period; int fastFrames = 0, slowFrames = 0; bool freeRunning = false; // Flap-quiet (M3 step 3): a doffed/unplugged display can oscillate in // and out of free-run; log the first transition of a burst, then stay // quiet while episodes restart within the window. Stats still count. double lastFreeRunExitT = -1e9; bool freeRunLogged = false; auto t0 = std::chrono::steady_clock::now(); uint64_t frames = 0, presentErrors = 0, waitTimeouts = 0, freeRunEvents = 0; std::chrono::steady_clock::time_point prevFrameStart{}; bool havePrevFrame = false; uint64_t consecTimeouts = 0; bool displayLost = false; // Late-latch pacing state (M4 step 3). lastVsync = the steady_clock // instant the present waitable last completed (a real vsync edge). // submitEma tracks TYPICAL draw-record + fence + present time (photon // estimate input); submitEnv is its fast-attack/slow-release envelope // (latch margin input): one contention balloon instantly widens the // margin toward a full period (= draw-after-vsync baseline) and it // decays back over ~1 s — a symmetric EMA reacted ~10 frames late and // then dove straight back into the spike (field: GPU-contention episode // read as every-other-frame rubber-banding). High-resolution timer for // the sub-millisecond sleep; falls back to coarse Sleep + a short spin // when unavailable (pre-1803 Windows). 
std::chrono::steady_clock::time_point lastVsync{}; bool haveVsyncTime = false; double submitEma = 0.003; double submitEnv = 0.003; HANDLE latchTimer = CreateWaitableTimerExW( nullptr, nullptr, CREATE_WAITABLE_TIMER_MANUAL_RESET | CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS); auto latchSleepUntil = [&](std::chrono::steady_clock::time_point target) { const double remSec = std::chrono::duration( target - std::chrono::steady_clock::now()).count(); if (remSec <= 0.0002) return; if (latchTimer) { LARGE_INTEGER due; due.QuadPart = -(LONGLONG)(remSec * 1e7); // relative, 100 ns units if (SetWaitableTimer(latchTimer, &due, 0, nullptr, nullptr, FALSE)) WaitForSingleObject(latchTimer, 50); } else { const double coarse = remSec - 0.0015; if (coarse > 0) Sleep((DWORD)(coarse * 1000.0)); while (std::chrono::steady_clock::now() < target) YieldProcessor(); } }; // Doze present (M5/ADR-0005): clear one buffer to true black and present it, // so the held scanout surface is bare black. Returns false if the present // failed (link trouble). Seeds doze and serves as the idle heartbeat. 
auto presentBlack = [&](uint64_t frameNo) -> bool { int bi = (int)(frameNo % kNumBuffers); im.alloc->Reset(); im.list->Reset(im.alloc.Get(), nullptr); D3D12_RESOURCE_BARRIER bb{}; bb.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; bb.Transition.pResource = im.res[bi].Get(); bb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; bb.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; bb.Transition.StateAfter = D3D12_RESOURCE_STATE_RENDER_TARGET; im.list->ResourceBarrier(1, &bb); const float black[4] = {0.f, 0.f, 0.f, 1.f}; im.list->ClearRenderTargetView(im.rtv[bi], black, 0, nullptr); D3D12_RESOURCE_BARRIER bb2 = bb; bb2.Transition.StateBefore = D3D12_RESOURCE_STATE_RENDER_TARGET; bb2.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON; im.list->ResourceBarrier(1, &bb2); im.list->Close(); ID3D12CommandList* lists[] = {im.list.Get()}; im.queue->ExecuteCommandLists(1, lists); im.queue->Signal(im.fence.Get(), ++im.fenceVal); if (im.fence->GetCompletedValue() < im.fenceVal) { im.fence->SetEventOnCompletion(im.fenceVal, im.fenceEvt); WaitForSingleObject(im.fenceEvt, 1000); } NvAPI_Status st = NvAPI_D3D_DirectModePresent12( &im.display, im.surf[bi], NV_DIRECTMODE_PRESENT_FLAG_VSYNC, im.queue.Get(), im.list.Get()); return st == NVAPI_OK; }; while (!stopRequested_.load()) { double t = std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); if (seconds > 0 && t >= seconds) break; auto frameStart = std::chrono::steady_clock::now(); if (havePrevFrame) { const double intervalMs = std::chrono::duration(frameStart - prevFrameStart).count(); std::lock_guard lk(im.statsMu); if (intervalMs > im.stats.maxFrameIntervalMs) im.stats.maxFrameIntervalMs = intervalMs; } prevFrameStart = frameStart; havePrevFrame = true; int i = (int)(frames % kNumBuffers); // Hot-unplug check BEFORE touching the device: presenting to a removed // device can block inside the driver indefinitely (field finding: cable // pull mid-run froze the loop and pinned the last frame on 
replug). HRESULT removed = im.dev->GetDeviceRemovedReason(); if (removed != S_OK) { fprintf(stderr, "Presenter: D3D12 device removed (0x%lx) — display lost " "(cable pulled?), exiting present loop\n", removed); displayLost = true; break; } // SteamVR coexistence (M4): another runtime wants the headset — drop // the WHOLE DirectMode acquisition (the panels park below only stops // scanout but keeps the display held, which blocks SteamVR's // acquire). Park with zero GPU work until ownership returns, then // re-acquire and resume; pacing state is meaningless across the gap. // The D3D12 device persists, so the app's render resources survive. if (!hmdOwned_.load()) { printf("Presenter: releasing HMD display (other runtime session)\n"); releaseDisplay(); // A release preempts doze: the video drop clears the firmware's // doze-latch on its own (ADR-0005), so drop the host's matching // present-idle intent too — the reclaim must come up presenting // normally, not silently re-enter doze on a stale flag (full // reclaim-into-doze is the deferred ADR-0002 amendment). dozing_.store(false); { std::lock_guard lk(im.statsMu); im.stats.hmdReleased = true; im.stats.dozing = false; } bool deadline = false; while (!hmdOwned_.load() && !stopRequested_.load()) { Sleep(100); if (seconds > 0 && std::chrono::duration(std::chrono::steady_clock::now() - t0).count() >= seconds) { deadline = true; break; } if (im.dev->GetDeviceRemovedReason() != S_OK) { fprintf(stderr, "Presenter: device removed while HMD released — " "display lost, exiting present loop\n"); displayLost = true; break; } } if (displayLost || deadline || stopRequested_.load()) break; // Re-acquire: the other runtime may still be letting go of the // display — keep retrying as long as we're asked to own it. 
bool reacquired = false; while (hmdOwned_.load() && !stopRequested_.load()) { if (acquireDisplay()) { reacquired = true; break; } fprintf(stderr, "Presenter: re-acquire failed — retrying in 2 s\n"); Sleep(2000); } if (reacquired) { std::lock_guard lk(im.statsMu); im.stats.hmdReleased = false; } consecTimeouts = 0; workEma = period; fastFrames = slowFrames = 0; freeRunning = false; havePrevFrame = false; // released span is not a frame interval haveVsyncTime = false; // vsync reference is stale across the gap continue; // re-evaluate stop/ownership/panel state from the top } // Idle-policy power gate (M3 step 4): park here while panels are // requested off — POWER_OFF stops scanout and the firmware drops the // panels with the video signal. Hot-unplug still watched. On wake, // re-modeset (the S2 init order: SetDisplayMode, then POWER_ON) and // reset the pacing/free-run state, which is meaningless across a gap. if (!panelPowerOn_.load()) { NvAPI_Status ps = NvAPI_DISP_DirectModeDisplayControl( &im.display, NV_DM_DISPLAY_CONTROL_POWER_OFF); printf("Presenter: panels OFF (idle policy) = 0x%x\n", ps); { std::lock_guard lk(im.statsMu); im.stats.panelsOff = true; } bool deadline = false; while (!panelPowerOn_.load() && !stopRequested_.load()) { Sleep(50); if (seconds > 0 && std::chrono::duration(std::chrono::steady_clock::now() - t0).count() >= seconds) { deadline = true; // timed runs must not park past their end break; } if (im.dev->GetDeviceRemovedReason() != S_OK) { fprintf(stderr, "Presenter: device removed while panels off — " "display lost, exiting present loop\n"); displayLost = true; break; } } if (displayLost || deadline || stopRequested_.load()) break; ps = NvAPI_D3D_DirectModeSetDisplayMode(&im.display, &im.mode); if (ps == NVAPI_HDCP_DISABLED) ps = NVAPI_OK; NvAPI_Status pw = NvAPI_DISP_DirectModeDisplayControl( &im.display, NV_DM_DISPLAY_CONTROL_POWER_ON); printf("Presenter: panels ON (wake), SetDisplayMode=0x%x POWER_ON=0x%x\n", ps, pw); consecTimeouts = 
0; workEma = period; fastFrames = slowFrames = 0; freeRunning = false; havePrevFrame = false; // parked span is not a frame interval haveVsyncTime = false; // vsync reference is stale across the park { std::lock_guard lk(im.statsMu); im.stats.panelsOff = false; im.stats.freeRunning = false; } } // Doze present-idle (M5/ADR-0005): warm shallow sleep. The firmware has // swept the panels dark on its own ('H' display-sleep), keeping the VXR // locked and video_enabled true. The host keeps the acquisition AND panel // power (no POWER_OFF, no release) but stops presenting at mode rate: // present one black frame so the held scanout is bare black, then idle to // a slow heartbeat. DirectMode scans out the last surface continuously, so // the link holds with no per-frame present (the no-present-free-run spike, // ADR-0005 open item); the ~1 Hz heartbeat re-present is cheap insurance // while that hold is validated on hardware. Wake = requestDoze(false) -> // resume mode-rate presenting, racing the firmware 'h' (~95 ms, no // DSC/link retrain). A release or POWER_OFF request taking precedence // while dozing falls through to those blocks next iteration (the firmware // clears its own doze-latch on the resulting video loss). if (dozing_.load() && hmdOwned_.load() && panelPowerOn_.load()) { // Single-buffer doze (M5/ADR-0005): seed buffer 0 as the held black // scanout, then free buffer 1 (~100 MB). For the whole doze span only // buffer 0 is presented (seed AND heartbeat both target index 0), so the // second surface is dead VRAM — recreated on wake below. Floor 275 -> ~175. const bool seeded = presentBlack(0); freeScanoutBuffer(1); printf("Presenter: dozing (present idled to ~1 Hz heartbeat; seed " "black %s; freed scanout buffer 1, %llu MB)\n", seeded ? 
"ok" : "FAILED", (unsigned long long)vramMB()); { std::lock_guard lk(im.statsMu); im.stats.dozing = true; im.stats.freeRunning = false; } bool deadline = false; auto lastBeat = std::chrono::steady_clock::now(); while (dozing_.load() && hmdOwned_.load() && panelPowerOn_.load() && !stopRequested_.load()) { Sleep(50); const auto now = std::chrono::steady_clock::now(); if (seconds > 0 && std::chrono::duration(now - t0).count() >= seconds) { deadline = true; // timed runs must not idle past their end break; } if (im.dev->GetDeviceRemovedReason() != S_OK) { fprintf(stderr, "Presenter: device removed while dozing — display lost " "(no-present link drop?), exiting present loop\n"); displayLost = true; break; } // Heartbeat: re-present black ~1 Hz. Insurance against any driver / // DirectMode expectation of periodic presents while the zero-present // hold is proven on hardware; raise the interval once it is. if (std::chrono::duration(now - lastBeat).count() >= 1.0) { if (!presentBlack(0)) // buffer 1 is freed during doze — only 0 lives fprintf(stderr, "Presenter: doze heartbeat present failed\n"); lastBeat = now; } } if (displayLost || deadline || stopRequested_.load()) break; // Left doze (woke, or a release / power-off request now takes // precedence): recreate the freed buffer 1 so the rest of the loop never // sees a missing scanout surface — the normal present alternates buffers, // and a preempting release/power-off path destroys both cleanly. Buffer 0 // stayed the held scanout the whole time, so the link never dropped. if (!createScanoutBuffer(1)) { fprintf(stderr, "Presenter: wake — failed to recreate scanout buffer 1, " "exiting present loop\n"); displayLost = true; // can't safely present with a buffer missing break; } // Pacing, free-run, and vsync references are all stale across the idle span. 
consecTimeouts = 0; workEma = period; fastFrames = slowFrames = 0; freeRunning = false; havePrevFrame = false; haveVsyncTime = false; { std::lock_guard lk(im.statsMu); im.stats.dozing = false; im.stats.freeRunning = false; } printf("Presenter: doze end — resuming mode-rate present\n"); continue; // re-evaluate stop/ownership/panel/doze state from the top } // Vsync reference for this frame: real edges keep arriving under GPU // contention (frames just flip periods late), so the grid stays valid // for a good while — gate on the free-run flag (doffed waitable = no // real edges) plus a generous 1 s staleness cap. The old 2-period cap // dropped to the fixed-span fallback mid-contention and the // alternation between the two prediction modes read as judder. const auto nowTp = std::chrono::steady_clock::now(); const bool vsyncRefOk = haveVsyncTime && !freeRunning && std::chrono::duration(nowTp - lastVsync).count() < 1.0; // Late-latch (M4 step 3): sleep until just-before-flip so the pose // sampled in draw() is as fresh as the submit margin allows. Sleep // time is excluded from the free-run work metric below (a dead // waitable + this sleep would otherwise read as healthy pacing). double latchSleepSec = 0.0; double latchOverMs = 0.0; if (vsyncRefOk && lateLatch_.load()) { double margin = 1.5 * submitEnv + 0.002; if (margin < 0.004) margin = 0.004; if (margin > period) margin = period; const auto target = lastVsync + std::chrono::duration_cast< std::chrono::steady_clock::duration>( std::chrono::duration(period - margin)); latchSleepUntil(target); const auto wake = std::chrono::steady_clock::now(); latchSleepSec = std::chrono::duration(wake - nowTp).count(); // Oversleep past the target = OS timer coalescing / power-save // scheduling eating the submit margin (lat= watermark). 
latchOverMs = std::chrono::duration(wake - target).count(); if (latchOverMs < 0.0) latchOverMs = 0.0; } const auto submitStart = std::chrono::steady_clock::now(); // Photon estimate AFTER the latch sleep, against the first flip this // frame can actually reach given typical submit time — under // contention frames land 2-3 periods past lastVsync, and a hardwired // next-flip (k=1) estimate under-predicted exactly the late frames // (field: alternating pose error = rapid rubber-banding). Healthy // loop: k=1, identical to before. Photons mid-scanout, +period/2. int64_t photonNs = 0; if (vsyncRefOk) { const double sinceVsync = std::chrono::duration( submitStart - lastVsync).count(); double k = std::ceil((sinceVsync + submitEma) / period); if (k < 1.0) k = 1.0; if (k > 4.0) k = 4.0; // AHRS clamps the span at 50 ms anyway const auto photon = lastVsync + std::chrono::duration_cast< std::chrono::steady_clock::duration>( std::chrono::duration((k + 0.5) * period)); photonNs = std::chrono::duration_cast( photon.time_since_epoch()).count(); } im.alloc->Reset(); im.list->Reset(im.alloc.Get(), nullptr); D3D12_RESOURCE_BARRIER b{}; b.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; b.Transition.pResource = im.res[i].Get(); b.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; b.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; b.Transition.StateAfter = D3D12_RESOURCE_STATE_RENDER_TARGET; im.list->ResourceBarrier(1, &b); FrameContext fc{}; fc.cmdList = im.list.Get(); fc.rtv = &im.rtv[i]; fc.width = im.width; fc.height = im.height; fc.rtvFormat = (uint32_t)im.rtvFormat; fc.timeSec = t; fc.frameIndex = frames; fc.photonTimeNs = photonNs; const auto d0 = std::chrono::steady_clock::now(); draw(fc); const double drawCpuMs = std::chrono::duration( std::chrono::steady_clock::now() - d0).count(); D3D12_RESOURCE_BARRIER b2 = b; b2.Transition.StateBefore = D3D12_RESOURCE_STATE_RENDER_TARGET; b2.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON; 
im.list->ResourceBarrier(1, &b2); im.list->Close(); ID3D12CommandList* lists[] = {im.list.Get()}; im.queue->ExecuteCommandLists(1, lists); im.queue->Signal(im.fence.Get(), ++im.fenceVal); if (im.fence->GetCompletedValue() < im.fenceVal) { const auto fw0 = std::chrono::steady_clock::now(); im.fence->SetEventOnCompletion(im.fenceVal, im.fenceEvt); WaitForSingleObject(im.fenceEvt, 1000); const double fwMs = std::chrono::duration( std::chrono::steady_clock::now() - fw0).count(); std::lock_guard lk(im.statsMu); if (fwMs > im.stats.maxFenceWaitMs) im.stats.maxFenceWaitMs = fwMs; } const auto p0 = std::chrono::steady_clock::now(); NvAPI_Status st = NvAPI_D3D_DirectModePresent12( &im.display, im.surf[i], NV_DIRECTMODE_PRESENT_FLAG_VSYNC, im.queue.Get(), im.list.Get()); const double presentCpuMs = std::chrono::duration( std::chrono::steady_clock::now() - p0).count(); if (st != NVAPI_OK) { presentErrors++; fprintf(stderr, "Present12 = 0x%x at frame %llu\n", st, (unsigned long long)frames); if (presentErrors > 10) break; } const double submitSec = std::chrono::duration( std::chrono::steady_clock::now() - submitStart).count(); submitEma += 0.1 * (submitSec - submitEma); // Margin envelope: attack instantly, release ~1%/frame (~1 s back to // typical at 75 Hz) — the latch must never chase a contention spike // from below. 
submitEnv = std::fmax(submitSec, submitEnv * 0.99); if (submitEnv < 0.001) submitEnv = 0.001; const auto vw0 = std::chrono::steady_clock::now(); const bool vsyncTimeout = im.haveWaitable && WaitForSingleObject(im.waitEvt, 1000) == WAIT_TIMEOUT; double vswWaitMs = -1.0; // <0 = no waitable this frame if (im.haveWaitable) { const auto vw1 = std::chrono::steady_clock::now(); vswWaitMs = std::chrono::duration(vw1 - vw0).count(); if (!vsyncTimeout) { lastVsync = vw1; // a real (or believed-real) vsync edge haveVsyncTime = true; } std::lock_guard lk(im.statsMu); if (vswWaitMs > im.stats.maxVsyncWaitMs) im.stats.maxVsyncWaitMs = vswWaitMs; } if (vsyncTimeout) { waitTimeouts++; // Doff only dims the panels and keeps the waitable signaling; a string // of 1 s timeouts means the display itself is gone. if (++consecTimeouts >= 5) { fprintf(stderr, "Presenter: present waitable dead for %llus — display lost, " "exiting present loop\n", (unsigned long long)consecTimeouts); displayLost = true; break; } } else { consecTimeouts = 0; } // Free-run detection. With a waitable, classify by the VSYNC WAIT // itself (M4 step 3 field fix): a real vsync edge blocks the wait — // ~period-submit with the latch off, ~latch-margin with it on, but // always >= ~0.5 ms in any healthy phase — while a dead waitable // (doffed headset) returns instantly every single frame. The old // work-time rule false-fired under late-latch: the latch sleep moved // the loop's idle out of the vsync wait, work collapsed below // period/2, and the spurious FREE-RUN flapping (plus its self-pace // sleep stacked on real vsync waits) caused intermittent missed flips // — the eyes-in "hitching" report. Work-time rule kept as the // no-waitable fallback. work still excludes the latch sleep. 
double work = std::chrono::duration( std::chrono::steady_clock::now() - frameStart).count() - latchSleepSec; workEma += 0.1 * (work - workEma); if (vswWaitMs >= 0.0) { if (vswWaitMs < 0.5) fastFrames++; else fastFrames = 0; if (vswWaitMs >= 2.0) slowFrames++; else slowFrames = 0; } else { if (workEma < 0.5 * period) { fastFrames++; slowFrames = 0; } else if (work > 0.8 * period) { slowFrames++; fastFrames = 0; } } if (!freeRunning && fastFrames >= 30) { freeRunning = true; freeRunEvents++; freeRunLogged = t - lastFreeRunExitT > 10.0; // new burst, not a flap if (freeRunLogged) fprintf(stderr, "WARNING: present loop free-running (vsync lost — headset " "doffed?), self-pacing at %.0f Hz\n", im.modeHz); } else if (freeRunning && slowFrames >= 10) { freeRunning = false; workEma = period; lastFreeRunExitT = t; if (freeRunLogged) fprintf(stderr, "free-run over: vsync pacing restored\n"); } if (freeRunning && work < period) { // Keep presenting (recovers automatically when panels return) but do // not let the loop spin at 5000+ fps. Sleep((DWORD)((period - work) * 1000.0)); } frames++; { std::lock_guard lk(im.statsMu); im.stats.frames = frames; im.stats.presentErrors = presentErrors; im.stats.waitTimeouts = waitTimeouts; im.stats.avgFps = t > 0 ? frames / t : 0; im.stats.freeRunning = freeRunning; im.stats.freeRunEvents = freeRunEvents; if (latchOverMs > im.stats.maxLatchOverMs) im.stats.maxLatchOverMs = latchOverMs; if (drawCpuMs > im.stats.maxDrawCpuMs) im.stats.maxDrawCpuMs = drawCpuMs; if (presentCpuMs > im.stats.maxPresentCpuMs) im.stats.maxPresentCpuMs = presentCpuMs; } } if (latchTimer) CloseHandle(latchTimer); { std::lock_guard lk(im.statsMu); im.stats.displayLost = displayLost; } double total = std::chrono::duration(std::chrono::steady_clock::now() - t0).count(); printf("PRESENT RESULT: %llu frames in %.1fs = %.2f fps avg, presentErrors " "%llu, waitTimeouts %llu, freeRunEvents %llu\n", (unsigned long long)frames, total, total > 0 ? 
frames / total : 0, (unsigned long long)presentErrors, (unsigned long long)waitTimeouts, (unsigned long long)freeRunEvents); if (cfg_.powerOffOnExit) NvAPI_DISP_DirectModeDisplayControl(&im.display, NV_DM_DISPLAY_CONTROL_POWER_OFF); } double NvapiD3d12Presenter::modeHz() const { return impl_->modeHz; } bool NvapiD3d12Presenter::panelsOffNow() const { std::lock_guard lk(impl_->statsMu); return impl_->stats.panelsOff; } bool NvapiD3d12Presenter::hmdReleasedNow() const { std::lock_guard lk(impl_->statsMu); return impl_->stats.hmdReleased; } bool NvapiD3d12Presenter::dozingNow() const { std::lock_guard lk(impl_->statsMu); return impl_->stats.dozing; } PresentStats NvapiD3d12Presenter::stats() const { const int clk = currentGpuClockMHz(); // NvAPI call — outside the lock std::lock_guard lk(impl_->statsMu); PresentStats out = impl_->stats; out.gpuCoreClockMHz = clk; impl_->stats.maxFrameIntervalMs = 0.0; // per-read watermark window impl_->stats.maxFenceWaitMs = 0.0; impl_->stats.maxVsyncWaitMs = 0.0; impl_->stats.maxLatchOverMs = 0.0; impl_->stats.maxDrawCpuMs = 0.0; impl_->stats.maxPresentCpuMs = 0.0; return out; } } // namespace sauna