#include "capture/duplication_source.h"

// NOTE(review): the header names inside the angle brackets were missing from
// this copy of the file (twelve bare `#include` tokens). The list below is
// reconstructed from the identifiers this translation unit uses — confirm it
// against the original before merging.
#include <d3d11_4.h>
#include <d3d12.h>
#include <d3dcompiler.h>
#include <dxgi1_6.h>
#include <wrl/client.h>

#include <atomic>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <mutex>
#include <thread>
#include <vector>

using Microsoft::WRL::ComPtr;

namespace sauna {

namespace {

// Number of textures in the shared D3D11 -> D3D12 ring.
constexpr int kBuffers = 3;
// Worker back-off between failed duplication (re)acquire attempts.
constexpr uint32_t kReacquireSleepMs = 250;

// Cursor composite: one alpha-blended quad into mip0, position in NDC via
// a tiny dynamic cbuffer. Monochrome/masked shapes are converted to BGRA on
// the CPU when the shape arrives (rare); XOR-with-screen regions are
// approximated as opaque white — good enough for an I-beam to be visible.
//
// The shader text must keep its line breaks: the HLSL `//` comment on the
// cbuffer line would otherwise comment out the rest of the program.
const char kCursorShader[] = R"(
cbuffer C : register(b0) { float4 rect; };  // NDC x0, y0(top), x1, y1(bottom)
Texture2D tex : register(t0);
SamplerState smp : register(s0);
struct VSOut { float4 pos : SV_Position; float2 uv : TEXCOORD0; };
VSOut vsmain(uint id : SV_VertexID) {
  float2 c = float2(id & 1, id >> 1);
  VSOut o;
  o.pos = float4(lerp(rect.x, rect.z, c.x), lerp(rect.y, rect.w, c.y), 0, 1);
  o.uv = c;
  return o;
}
float4 psmain(VSOut i) : SV_Target { return tex.Sample(smp, i.uv); }
)";

// HDR/format convert pass (M4 step 5): when the duplicated desktop is not
// BGRA8 (advanced-color outputs duplicate as FP16 scRGB; 10-bit SDR as
// R10G10B10A2), the frame is copied into a desktop-format intermediate and
// this fullscreen pass writes the BGRA8 ring instead of a direct copy.
// scRGB is LINEAR with 1.0 = 80 nits: divide by the monitor's SDR white
// point (so SDR content lands at 1.0 exactly like an SDR desktop), clamp —
// HDR highlights above SDR white clip in v1 — and sRGB-encode. 10-bit SDR
// is already gamma-encoded: passthrough (p.y = 0). Cursor + mips run on
// the converted BGRA8 exactly as on the SDR path.
const char kConvertShader[] = R"( cbuffer C : register(b0) { float4 p; }; // x = 1/sdrWhite (scRGB), y = encode Texture2D tex : register(t0); SamplerState smp : register(s0); struct VSOut { float4 pos : SV_Position; float2 uv : TEXCOORD0; }; VSOut vsmain(uint id : SV_VertexID) { VSOut o; float2 uv = float2((id << 1) & 2, id & 2); o.pos = float4(uv * float2(2, -2) + float2(-1, 1), 0, 1); o.uv = uv; return o; } float enc(float c) { return c <= 0.0031308 ? 12.92 * c : 1.055 * pow(c, 1.0 / 2.4) - 0.055; } float4 psmain(VSOut i) : SV_Target { float3 c = tex.Sample(smp, i.uv).rgb; if (p.y > 0.5) { c = saturate(c * p.x); c = float3(enc(c.r), enc(c.g), enc(c.b)); } return float4(c, 1.0); } )"; bool compileSrc(const char* src, size_t len, const char* entry, const char* target, ComPtr* out) { ComPtr err; HRESULT hr = D3DCompile(src, len, nullptr, nullptr, nullptr, entry, target, 0, 0, &*out, &err); if (FAILED(hr)) { fprintf(stderr, "capture shader %s: %s\n", entry, err ? (const char*)err->GetBufferPointer() : "compile failed"); return false; } return true; } bool compile(const char* entry, const char* target, ComPtr* out) { return compileSrc(kCursorShader, sizeof(kCursorShader) - 1, entry, target, out); } // SDR white level of the monitor (scRGB units: SDRWhiteLevel/1000; 1.0 = // 80 nits). Windows' "SDR content brightness" slider sets it — SDR content // on an HDR desktop is composited at this level. 2.5 (200 nits) fallback. 
float querySdrWhiteScRGB(HMONITOR mon) { constexpr float kFallback = 2.5f; MONITORINFOEXW mi{}; mi.cbSize = sizeof(mi); if (!mon || !GetMonitorInfoW(mon, &mi)) return kFallback; UINT32 nPath = 0, nMode = 0; if (GetDisplayConfigBufferSizes(QDC_ONLY_ACTIVE_PATHS, &nPath, &nMode) != ERROR_SUCCESS) return kFallback; std::vector paths(nPath); std::vector modes(nMode); if (QueryDisplayConfig(QDC_ONLY_ACTIVE_PATHS, &nPath, paths.data(), &nMode, modes.data(), nullptr) != ERROR_SUCCESS) return kFallback; for (UINT32 i = 0; i < nPath; i++) { DISPLAYCONFIG_SOURCE_DEVICE_NAME sn{}; sn.header.type = DISPLAYCONFIG_DEVICE_INFO_GET_SOURCE_NAME; sn.header.size = sizeof(sn); sn.header.adapterId = paths[i].sourceInfo.adapterId; sn.header.id = paths[i].sourceInfo.id; if (DisplayConfigGetDeviceInfo(&sn.header) != ERROR_SUCCESS) continue; if (wcscmp(sn.viewGdiDeviceName, mi.szDevice) != 0) continue; DISPLAYCONFIG_SDR_WHITE_LEVEL wl{}; wl.header.type = DISPLAYCONFIG_DEVICE_INFO_GET_SDR_WHITE_LEVEL; wl.header.size = sizeof(wl); wl.header.adapterId = paths[i].targetInfo.adapterId; wl.header.id = paths[i].targetInfo.id; if (DisplayConfigGetDeviceInfo(&wl.header) != ERROR_SUCCESS) break; if (wl.SDRWhiteLevel >= 1000) // sanity: >= 80 nits return wl.SDRWhiteLevel / 1000.0f; break; } return kFallback; } uint32_t fullMipCount(uint32_t w, uint32_t h) { uint32_t m = 1, s = w > h ? w : h; while (s > 1) { s >>= 1; m++; } return m; } } // namespace struct DuplicationSource::Impl { int outputIndex = 0; ID3D12Device* dev12 = nullptr; // not owned ComPtr dev11; ComPtr dev11_5; ComPtr ctx; ComPtr ctx4; ComPtr adapter; // for output re-resolve on reacquire ComPtr output1; ComPtr dup; // Shared ring: D3D11 writes (copy + cursor + mips), D3D12 samples. ComPtr tex11[kBuffers]; ComPtr rtv11[kBuffers]; ComPtr srv11[kBuffers]; // full chain, for mips ComPtr tex12[kBuffers]; uint32_t w = 0, h = 0, mips = 1; // HDR/format convert state (M4 step 5). 
deskFmt follows the duplicated // desktop; anything but BGRA8 routes through hdrTex + the convert pass. DXGI_FORMAT deskFmt = DXGI_FORMAT_B8G8R8A8_UNORM; bool convertOn = false; bool convertEncode = false; // FP16 scRGB -> divide + sRGB-encode float sdrWhiteScRGB = 2.5f; ComPtr hdrTex; // copy dest in desktop format ComPtr hdrSrv; ComPtr tmVs; ComPtr tmPs; ComPtr tmCb; // Producer-side completion fence (CPU-waited before publish — keeps the // consumer free of cross-API sync; fence pacing proper is M4). ComPtr fence; uint64_t fenceValue = 0; HANDLE fenceEvent = nullptr; // Cursor pipeline + state (worker thread only). ComPtr curVs; ComPtr curPs; ComPtr curCb; ComPtr curBlend; ComPtr curSamp; ComPtr curTex; ComPtr curSrv; uint32_t curW = 0, curH = 0; bool curVisible = false; POINT curPos{}; int curHotX = 0, curHotY = 0; // shape hotspot (GetCursorInfo points at it) RECT outputRect{}; // this output's desktop coords (physical px) std::vector shapeRaw; // Cursor overlay policy: -1 auto (= on; the "advanced-color desktops // bake a software cursor into the duplicated image" field report // turned out to be a Parsec remote session — auto carries no format // heuristic until a machine actually demonstrates an in-image // cursor), 0 never (set on such a machine — doubling otherwise), // 1 always. std::atomic cursorOverlay{-1}; std::atomic overlayCursor{true}; // resolved at acquire + live set // Published state (mutex: worker writes, render thread reads). 
mutable std::mutex mu; int published = -1; uint32_t generation = 0; uint64_t frames = 0, reacquires = 0; bool capturing = false; double minPeriodMs = 0.0; // 0 = unthrottled (process every frame) std::thread worker; std::atomic stopFlag{false}; std::atomic pauseFlag{false}; bool warnedRotation = false; bool announcedLoss = false; bool createSharedTextures(); bool acquireDuplication(bool initial); void updateCursorShape(uint32_t size); void drawCursor(int idx); void drawConvert(int idx); void run(); }; bool DuplicationSource::Impl::createSharedTextures() { for (int i = 0; i < kBuffers; i++) { tex11[i].Reset(); rtv11[i].Reset(); srv11[i].Reset(); tex12[i].Reset(); } // Non-BGRA8 desktop: intermediate in the desktop's own format, convert // pass writes the ring. The shared ring stays BGRA8 — downstream (D3D12 // SRVs, warp) never sees the desktop format. hdrTex.Reset(); hdrSrv.Reset(); convertOn = deskFmt != DXGI_FORMAT_B8G8R8A8_UNORM; convertEncode = deskFmt == DXGI_FORMAT_R16G16B16A16_FLOAT; // Cursor overlay resolve (see policy note at the member): auto = on. 
overlayCursor = cursorOverlay.load() != 0; if (convertOn) { D3D11_TEXTURE2D_DESC hd{}; hd.Width = w; hd.Height = h; hd.MipLevels = 1; hd.ArraySize = 1; hd.Format = deskFmt; hd.SampleDesc.Count = 1; hd.Usage = D3D11_USAGE_DEFAULT; hd.BindFlags = D3D11_BIND_SHADER_RESOURCE; if (FAILED(dev11->CreateTexture2D(&hd, nullptr, &hdrTex)) || FAILED(dev11->CreateShaderResourceView(hdrTex.Get(), nullptr, &hdrSrv))) { fprintf(stderr, "capture: convert intermediate (fmt %d) failed\n", (int)deskFmt); return false; } } mips = fullMipCount(w, h); D3D11_TEXTURE2D_DESC td{}; td.Width = w; td.Height = h; td.MipLevels = mips; td.ArraySize = 1; td.Format = DXGI_FORMAT_B8G8R8A8_UNORM; td.SampleDesc.Count = 1; td.Usage = D3D11_USAGE_DEFAULT; td.BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE; td.MiscFlags = D3D11_RESOURCE_MISC_GENERATE_MIPS | D3D11_RESOURCE_MISC_SHARED | D3D11_RESOURCE_MISC_SHARED_NTHANDLE; if (FAILED(dev11->CreateTexture2D(&td, nullptr, &tex11[0]))) { // Some stacks refuse mipped shared textures — fall back to mip 1 and // accept minification shimmer rather than failing capture entirely. 
fprintf(stderr, "capture: mipped shared texture refused — falling back " "to no mips (expect shimmer)\n"); mips = 1; td.MipLevels = 1; td.MiscFlags = D3D11_RESOURCE_MISC_SHARED | D3D11_RESOURCE_MISC_SHARED_NTHANDLE; if (FAILED(dev11->CreateTexture2D(&td, nullptr, &tex11[0]))) { fprintf(stderr, "capture: shared texture creation failed\n"); return false; } } for (int i = 0; i < kBuffers; i++) { if (!tex11[i] && FAILED(dev11->CreateTexture2D(&td, nullptr, &tex11[i]))) return false; if (FAILED(dev11->CreateRenderTargetView(tex11[i].Get(), nullptr, &rtv11[i]))) return false; if (mips > 1 && FAILED(dev11->CreateShaderResourceView(tex11[i].Get(), nullptr, &srv11[i]))) return false; ComPtr dxgiRes; if (FAILED(tex11[i].As(&dxgiRes))) return false; HANDLE sh = nullptr; if (FAILED(dxgiRes->CreateSharedHandle( nullptr, DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE, nullptr, &sh))) return false; HRESULT hr = dev12->OpenSharedHandle(sh, IID_PPV_ARGS(&tex12[i])); CloseHandle(sh); if (FAILED(hr)) { fprintf(stderr, "capture: D3D12 OpenSharedHandle failed (0x%08lx)\n", (unsigned long)hr); return false; } } return true; } bool DuplicationSource::Impl::acquireDuplication(bool initial) { dup.Reset(); // Re-resolve the output every attempt: after a monitor unplug/replug the // old IDXGIOutput is dead even when the same index comes back. ComPtr output; if (FAILED(adapter->EnumOutputs(outputIndex, &output)) || FAILED(output.As(&output1))) { if (!announcedLoss) { fprintf(stderr, "capture: output %d gone — waiting for replug\n", outputIndex); announcedLoss = true; } return false; } // DuplicateOutput1 first (M4 step 5 field fix): plain DuplicateOutput // returns DXGI_ERROR_UNSUPPORTED outright on HDR/10-bit desktops — the // duplicated format must be negotiated. List every format the convert // pass handles; legacy DuplicateOutput stays as the pre-Win10-1703 // fallback. 
{ DXGI_OUTPUT_DESC od{}; if (SUCCEEDED(output1->GetDesc(&od))) outputRect = od.DesktopCoordinates; // cursor-authority mapping } HRESULT hr = E_NOINTERFACE; ComPtr output5; if (SUCCEEDED(output1.As(&output5))) { const DXGI_FORMAT fmts[] = {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM}; hr = output5->DuplicateOutput1(dev11.Get(), 0, 3, fmts, &dup); } if (FAILED(hr)) hr = output1->DuplicateOutput(dev11.Get(), &dup); if (FAILED(hr)) { // E_ACCESSDENIED: secure desktop (UAC). NOT_CURRENTLY_AVAILABLE: too // many duplications / transient. Both clear on their own — keep trying. // UNSUPPORTED that persists even via DuplicateOutput1: the output is // likely owned by a different GPU (hybrid graphics) — cross-adapter // capture is not in v1. if (!announcedLoss) { fprintf(stderr, "capture: DuplicateOutput failed (0x%08lx) — retrying" "%s\n", (unsigned long)hr, hr == DXGI_ERROR_UNSUPPORTED ? " (persistent UNSUPPORTED = output on another GPU? " "hybrid graphics is not supported in v1)" : ""); announcedLoss = true; } return false; } announcedLoss = false; DXGI_OUTDUPL_DESC dd{}; dup->GetDesc(&dd); if (dd.Rotation != DXGI_MODE_ROTATION_IDENTITY && dd.Rotation != DXGI_MODE_ROTATION_UNSPECIFIED && !warnedRotation) { fprintf(stderr, "capture: rotated output (%d) unsupported in v1 — image " "will be unrotated\n", (int)dd.Rotation); warnedRotation = true; } if (initial || dd.ModeDesc.Width != w || dd.ModeDesc.Height != h || dd.ModeDesc.Format != deskFmt) { // Mode-flap forensics (field: failed HDR enable attempts bounce the // desktop through formats and can leave the OS cursor state glitched // — make every transition visible in the console). 
if (!initial) printf("capture: output %d mode change %ux%u fmt %d -> %ux%u fmt %d " "— ring rebuilt\n", outputIndex, w, h, (int)deskFmt, dd.ModeDesc.Width, dd.ModeDesc.Height, (int)dd.ModeDesc.Format); w = dd.ModeDesc.Width; h = dd.ModeDesc.Height; deskFmt = dd.ModeDesc.Format; if (!createSharedTextures()) return false; if (convertOn) { if (convertEncode) { DXGI_OUTPUT_DESC odNow{}; output1->GetDesc(&odNow); sdrWhiteScRGB = querySdrWhiteScRGB(odNow.Monitor); printf("capture: output %d HDR (scRGB FP16) — tone-mapping to sRGB, " "SDR white %.0f nits\n", outputIndex, sdrWhiteScRGB * 80.0f); } else { printf("capture: output %d desktop format %d — passthrough convert " "(10-bit SDR assumed gamma-encoded)\n", outputIndex, (int)deskFmt); } } std::lock_guard lk(mu); published = -1; // Bump generation on EVERY (re)create — including initial — so the consumer // rebuilds its SRVs against the new textures. Needed for the doze stop()/ // resume() lever: resume() re-inits via start() (initial=true) with brand // new texture pointers, and a stale consumer SRV would dangle. First launch // bumps 0->1, which the consumer's sentinel gen already forces a rebuild on. generation++; } return true; } // Convert pass: desktop-format intermediate -> BGRA8 ring mip0. Runs // before the cursor blend, full viewport, no blend state. void DuplicationSource::Impl::drawConvert(int idx) { if (!tmVs || !tmPs) return; D3D11_MAPPED_SUBRESOURCE map{}; if (SUCCEEDED(ctx->Map(tmCb.Get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &map))) { float p[4] = {1.0f / sdrWhiteScRGB, convertEncode ? 
1.0f : 0.0f, 0, 0}; memcpy(map.pData, p, sizeof(p)); ctx->Unmap(tmCb.Get(), 0); } ID3D11RenderTargetView* rt = rtv11[idx].Get(); ctx->OMSetRenderTargets(1, &rt, nullptr); ctx->OMSetBlendState(nullptr, nullptr, 0xFFFFFFFFu); D3D11_VIEWPORT vp{0, 0, (float)w, (float)h, 0, 1}; ctx->RSSetViewports(1, &vp); ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); ctx->VSSetShader(tmVs.Get(), nullptr, 0); ctx->PSSetShader(tmPs.Get(), nullptr, 0); ID3D11Buffer* cb = tmCb.Get(); ctx->PSSetConstantBuffers(0, 1, &cb); ID3D11ShaderResourceView* sv = hdrSrv.Get(); ctx->PSSetShaderResources(0, 1, &sv); ID3D11SamplerState* sp = curSamp.Get(); ctx->PSSetSamplers(0, 1, &sp); ctx->Draw(3, 0); ID3D11ShaderResourceView* nullSrv = nullptr; // ring tex rebinds as RTV ctx->PSSetShaderResources(0, 1, &nullSrv); ID3D11RenderTargetView* nullRtv = nullptr; ctx->OMSetRenderTargets(1, &nullRtv, nullptr); } void DuplicationSource::Impl::updateCursorShape(uint32_t size) { shapeRaw.resize(size); DXGI_OUTDUPL_POINTER_SHAPE_INFO si{}; UINT needed = 0; if (FAILED(dup->GetFramePointerShape((UINT)shapeRaw.size(), shapeRaw.data(), &needed, &si))) return; uint32_t ow = si.Width, oh = si.Height; curHotX = (int)si.HotSpot.x; curHotY = (int)si.HotSpot.y; std::vector bgra; if (si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MONOCHROME) { oh = si.Height / 2; // AND mask rows, then XOR mask rows; 1 bpp MSB-first bgra.resize((size_t)ow * oh); for (uint32_t y = 0; y < oh; y++) for (uint32_t x = 0; x < ow; x++) { const uint8_t andB = shapeRaw[(size_t)y * si.Pitch + x / 8]; const uint8_t xorB = shapeRaw[(size_t)(y + oh) * si.Pitch + x / 8]; const uint8_t bit = 0x80 >> (x % 8); const bool a = andB & bit, xr = xorB & bit; uint32_t v; if (a) v = xr ? 0xFFFFFFFFu : 0x00000000u; // XOR-invert approx as white else v = xr ? 
0xFFFFFFFFu : 0xFF000000u; bgra[(size_t)y * ow + x] = v; } } else { // COLOR or MASKED_COLOR: 32bpp BGRA rows bgra.resize((size_t)ow * oh); for (uint32_t y = 0; y < oh; y++) for (uint32_t x = 0; x < ow; x++) { uint32_t v; memcpy(&v, &shapeRaw[(size_t)y * si.Pitch + x * 4], 4); if (si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR) { // Mask byte semantics: 0 = replace with RGB; 0xFF = XOR RGB // into the screen. XOR-with-zero leaves the screen untouched — // i.e. TRANSPARENT. The old "force everything opaque" turned // exactly those pixels into an opaque black box around the // I-beam (field, Win10 rig: text cursors arrive MASKED_COLOR // there, not MONOCHROME). Nonzero XOR pixels approximate as // opaque color (same compromise as the monochrome path). const uint32_t rgb = v & 0x00FFFFFFu; const bool xorPix = (v & 0xFF000000u) != 0; v = (xorPix && rgb == 0) ? 0x00000000u : (0xFF000000u | rgb); } bgra[(size_t)y * ow + x] = v; } } if (!curTex || ow != curW || oh != curH) { curSrv.Reset(); curTex.Reset(); D3D11_TEXTURE2D_DESC td{}; td.Width = ow; td.Height = oh; td.MipLevels = 1; td.ArraySize = 1; td.Format = DXGI_FORMAT_B8G8R8A8_UNORM; td.SampleDesc.Count = 1; td.Usage = D3D11_USAGE_DEFAULT; td.BindFlags = D3D11_BIND_SHADER_RESOURCE; if (FAILED(dev11->CreateTexture2D(&td, nullptr, &curTex))) return; if (FAILED(dev11->CreateShaderResourceView(curTex.Get(), nullptr, &curSrv))) return; curW = ow; curH = oh; } ctx->UpdateSubresource(curTex.Get(), 0, nullptr, bgra.data(), ow * 4, 0); } void DuplicationSource::Impl::drawCursor(int idx) { const float x0 = (float)curPos.x, y0 = (float)curPos.y; const float x1 = x0 + curW, y1 = y0 + curH; const float rect[4] = {x0 / w * 2 - 1, 1 - y0 / h * 2, x1 / w * 2 - 1, 1 - y1 / h * 2}; D3D11_MAPPED_SUBRESOURCE m{}; if (SUCCEEDED(ctx->Map(curCb.Get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &m))) { memcpy(m.pData, rect, sizeof(rect)); ctx->Unmap(curCb.Get(), 0); } ID3D11RenderTargetView* rtv = rtv11[idx].Get(); ctx->OMSetRenderTargets(1, 
&rtv, nullptr); D3D11_VIEWPORT vp{0, 0, (float)w, (float)h, 0, 1}; ctx->RSSetViewports(1, &vp); ctx->IASetInputLayout(nullptr); ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); ctx->VSSetShader(curVs.Get(), nullptr, 0); ctx->VSSetConstantBuffers(0, 1, curCb.GetAddressOf()); ctx->PSSetShader(curPs.Get(), nullptr, 0); ctx->PSSetShaderResources(0, 1, curSrv.GetAddressOf()); ctx->PSSetSamplers(0, 1, curSamp.GetAddressOf()); const float bf[4] = {0, 0, 0, 0}; ctx->OMSetBlendState(curBlend.Get(), bf, 0xFFFFFFFF); ctx->Draw(4, 0); ID3D11RenderTargetView* nullRtv = nullptr; ctx->OMSetRenderTargets(1, &nullRtv, nullptr); ID3D11ShaderResourceView* nullSrv = nullptr; ctx->PSSetShaderResources(0, 1, &nullSrv); } void DuplicationSource::Impl::run() { int writeNext = 0; // worker-local; (published + 1) % kBuffers auto lastProcess = std::chrono::steady_clock::now() - std::chrono::seconds(1); while (!stopFlag.load()) { // Parked (SteamVR session or idle sleep): zero GPU work, and the // OS-side duplication is dropped entirely so DWM owes us nothing // while parked. Resume rides the normal reacquire loop (dup == null) // and republishes the latest desktop within a frame or two. if (pauseFlag.load()) { if (dup) { dup.Reset(); std::lock_guard lk(mu); capturing = false; } Sleep(100); continue; } // Rate cap (M4 step 3 field fix): process at most one desktop frame // per panel period. The full pipeline below (copy + cursor + // GenerateMips of a desktop-size chain + flush) runs PER CAPTURED // FRAME — content redrawing at 130-170 Hz cost 2x+ the GPU work the // panel could ever show and read as a periodic contention episode // in-headset (gpu=/vsw= balloons co-timed with cap= bursts). // AcquireNextFrame coalesces while we sleep, so this stays // latest-frame-wins; the only cost is sub-period content latency. 
if (minPeriodMs > 0.0) { const double sinceMs = std::chrono::duration( std::chrono::steady_clock::now() - lastProcess).count(); if (sinceMs < minPeriodMs) Sleep((DWORD)(minPeriodMs - sinceMs)); } if (!dup) { if (!acquireDuplication(false)) { { std::lock_guard lk(mu); capturing = false; } Sleep(kReacquireSleepMs); continue; } std::lock_guard lk(mu); reacquires++; } DXGI_OUTDUPL_FRAME_INFO fi{}; ComPtr res; HRESULT hr = dup->AcquireNextFrame(100, &fi, &res); if (hr == DXGI_ERROR_WAIT_TIMEOUT) continue; // desktop unchanged if (FAILED(hr)) { dup.Reset(); // ACCESS_LOST et al — reacquire loop above std::lock_guard lk(mu); capturing = false; continue; } if (fi.PointerShapeBufferSize) updateCursorShape(fi.PointerShapeBufferSize); // Cursor authority = the OS, not duplication metadata (field, // enlyzeam: after failed HDR-enable flaps the metadata position // desynced from the real cursor by a constant offset, and an output // the cursor LEFT kept its last Visible=true metadata — the overlay // froze at the screen edge). GetCursorInfo is exact and current at // composite time; duplication metadata is kept only for the SHAPE. // ptScreenPos is the HOTSPOT in physical pixels (the app declares // per-monitor DPI awareness); the shape draws at its top-left. // flags: SHOWING bit clear covers ShowCursor(-1) hides AND the Win10 // CURSOR_SUPPRESSED state (touch/pen, remote-control software). // Null hCursor with SHOWING set = hidden via a null cursor (field: // Parsec hid the host cursor; the old metadata path kept drawing an // overlay over the remote session). 
CURSORINFO ci{}; ci.cbSize = sizeof(ci); if (GetCursorInfo(&ci) && (ci.flags & CURSOR_SHOWING) && ci.hCursor) { curVisible = ci.ptScreenPos.x >= outputRect.left && ci.ptScreenPos.x < outputRect.right && ci.ptScreenPos.y >= outputRect.top && ci.ptScreenPos.y < outputRect.bottom; curPos.x = ci.ptScreenPos.x - outputRect.left - curHotX; curPos.y = ci.ptScreenPos.y - outputRect.top - curHotY; } else { curVisible = false; } ComPtr desk; res.As(&desk); const int idx = writeNext; // Non-BGRA8 desktop (HDR scRGB, 10-bit): copy into the desktop-format // intermediate, convert pass fills the ring; SDR BGRA8 copies direct. if (desk) ctx->CopySubresourceRegion( convertOn ? hdrTex.Get() : tex11[idx].Get(), 0, 0, 0, 0, desk.Get(), 0, nullptr); dup->ReleaseFrame(); // release ASAP — holding it blocks DWM if (convertOn) drawConvert(idx); if (overlayCursor.load() && curVisible && curSrv) drawCursor(idx); if (mips > 1) ctx->GenerateMips(srv11[idx].Get()); // CPU-confirm GPU completion, THEN publish — the consumer contract. 
fenceValue++; ctx4->Signal(fence.Get(), fenceValue); ctx->Flush(); if (SUCCEEDED(fence->SetEventOnCompletion(fenceValue, fenceEvent))) WaitForSingleObject(fenceEvent, 1000); { std::lock_guard lk(mu); published = idx; frames++; capturing = true; } lastProcess = std::chrono::steady_clock::now(); writeNext = (idx + 1) % kBuffers; } } DuplicationSource::DuplicationSource(int outputIndex) : impl_(new Impl) { impl_->outputIndex = outputIndex; } void DuplicationSource::setMinFramePeriodMs(double ms) { impl_->minPeriodMs = ms; } DuplicationSource::~DuplicationSource() { stop(); } bool DuplicationSource::start(ID3D12Device* renderDev) { Impl& im = *impl_; im.dev12 = renderDev; const LUID luid = renderDev->GetAdapterLuid(); ComPtr factory; if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) return false; if (FAILED(factory->EnumAdapterByLuid(luid, IID_PPV_ARGS(&im.adapter)))) { fprintf(stderr, "capture: render adapter not found by LUID\n"); return false; } ComPtr output; if (FAILED(im.adapter->EnumOutputs(im.outputIndex, &output))) { fprintf(stderr, "capture: output %d not found on the render adapter " "(monitor unplugged? wrong --monitor?)\n", im.outputIndex); return false; } if (FAILED(output.As(&im.output1))) return false; DXGI_OUTPUT_DESC od{}; output->GetDesc(&od); UINT flags = D3D11_CREATE_DEVICE_BGRA_SUPPORT; if (FAILED(D3D11CreateDevice(im.adapter.Get(), D3D_DRIVER_TYPE_UNKNOWN, nullptr, flags, nullptr, 0, D3D11_SDK_VERSION, &im.dev11, nullptr, &im.ctx))) { fprintf(stderr, "capture: D3D11 device creation failed\n"); return false; } if (FAILED(im.dev11.As(&im.dev11_5)) || FAILED(im.ctx.As(&im.ctx4))) { fprintf(stderr, "capture: D3D11.4 (fences) unavailable\n"); return false; } if (FAILED(im.dev11_5->CreateFence(0, D3D11_FENCE_FLAG_NONE, IID_PPV_ARGS(&im.fence)))) return false; im.fenceEvent = CreateEventW(nullptr, FALSE, FALSE, nullptr); if (!im.fenceEvent) return false; // Cursor pipeline. 
ComPtr vs, ps; if (!compile("vsmain", "vs_5_0", &vs) || !compile("psmain", "ps_5_0", &ps)) return false; if (FAILED(im.dev11->CreateVertexShader(vs->GetBufferPointer(), vs->GetBufferSize(), nullptr, &im.curVs))) return false; if (FAILED(im.dev11->CreatePixelShader(ps->GetBufferPointer(), ps->GetBufferSize(), nullptr, &im.curPs))) return false; D3D11_BUFFER_DESC cbd{}; cbd.ByteWidth = 16; cbd.Usage = D3D11_USAGE_DYNAMIC; cbd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; cbd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; if (FAILED(im.dev11->CreateBuffer(&cbd, nullptr, &im.curCb))) return false; // Convert pass (M4 step 5): HDR/10-bit desktops -> BGRA8 ring. ComPtr tvs, tps; if (compileSrc(kConvertShader, sizeof(kConvertShader) - 1, "vsmain", "vs_5_0", &tvs) && compileSrc(kConvertShader, sizeof(kConvertShader) - 1, "psmain", "ps_5_0", &tps)) { im.dev11->CreateVertexShader(tvs->GetBufferPointer(), tvs->GetBufferSize(), nullptr, &im.tmVs); im.dev11->CreatePixelShader(tps->GetBufferPointer(), tps->GetBufferSize(), nullptr, &im.tmPs); } if (FAILED(im.dev11->CreateBuffer(&cbd, nullptr, &im.tmCb))) return false; D3D11_BLEND_DESC bd{}; bd.RenderTarget[0].BlendEnable = TRUE; bd.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA; bd.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA; bd.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD; bd.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ONE; bd.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_INV_SRC_ALPHA; bd.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD; bd.RenderTarget[0].RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL; if (FAILED(im.dev11->CreateBlendState(&bd, &im.curBlend))) return false; D3D11_SAMPLER_DESC sd{}; sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; sd.AddressU = sd.AddressV = sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; if (FAILED(im.dev11->CreateSamplerState(&sd, &im.curSamp))) return false; if (!im.acquireDuplication(true)) { // First acquire may legitimately fail (secure desktop at launch) — the // worker keeps retrying; but 
we need texture dimensions to proceed, so // fall back to the output's desktop rect for sizing. if (im.w == 0 || im.h == 0) { im.w = od.DesktopCoordinates.right - od.DesktopCoordinates.left; im.h = od.DesktopCoordinates.bottom - od.DesktopCoordinates.top; if (!im.createSharedTextures()) return false; } } printf("capture: output %d \"%ls\" %ux%u, %u mips, %d-buffer ring\n", im.outputIndex, od.DeviceName, im.w, im.h, im.mips, kBuffers); im.stopFlag.store(false); im.worker = std::thread([this] { impl_->run(); }); return true; } void DuplicationSource::setPaused(bool paused) { if (impl_) impl_->pauseFlag.store(paused); } void DuplicationSource::setCursorOverlay(int mode) { if (!impl_) return; const int m = mode < -1 ? -1 : mode > 1 ? 1 : mode; impl_->cursorOverlay.store(m); impl_->overlayCursor.store(m != 0); // live re-resolve (settings watcher) } void DuplicationSource::stop() { if (!impl_) return; Impl& im = *impl_; im.stopFlag.store(true); if (im.worker.joinable()) im.worker.join(); // Release ALL GPU resources so the per-process VRAM actually returns: the // ring textures AND the private D3D11 device are hundreds of MB, and freeing // the textures alone leaves them driver-pooled — the device has to go too. // The Impl OBJECT stays alive (outputIndex, layout, dev12, settings persist), // so cross-thread accessors (stats / setPaused / setCursorOverlay) stay valid // and resume() rebuilds the GPU side. This is the doze VRAM lever (M5). 
im.dup.Reset(); for (int i = 0; i < kBuffers; i++) { im.tex11[i].Reset(); im.rtv11[i].Reset(); im.srv11[i].Reset(); im.tex12[i].Reset(); } im.hdrTex.Reset(); im.hdrSrv.Reset(); im.tmVs.Reset(); im.tmPs.Reset(); im.tmCb.Reset(); im.curVs.Reset(); im.curPs.Reset(); im.curCb.Reset(); im.curBlend.Reset(); im.curSamp.Reset(); im.curTex.Reset(); im.curSrv.Reset(); im.fence.Reset(); if (im.fenceEvent) { CloseHandle(im.fenceEvent); im.fenceEvent = nullptr; } im.output1.Reset(); im.adapter.Reset(); im.ctx4.Reset(); im.ctx.Reset(); im.dev11_5.Reset(); im.dev11.Reset(); // device last — frees once all its resources are released { std::lock_guard lk(im.mu); im.published = -1; // no valid frame -> the consumer skips this source im.capturing = false; } im.w = im.h = 0; // force a full createSharedTextures on resume im.curW = im.curH = 0; // force cursor shape re-upload } bool DuplicationSource::resume() { // Rebuild the GPU side after stop() freed it (doze wake). Reuses the cached // D3D12 render device; ~tens of ms (D3D11 device + duplication reacquire), // hidden by the firmware wake brightness sweep. The object/outputIndex/layout // persisted across the stop, so this is a pure GPU re-init. if (!impl_ || !impl_->dev12) return false; // No-op if the source is still running (was never stop()'d — e.g. a wake out // of the SteamVR reclaim-into-doze path, where capture was only paused). // Calling start() on a live source assigns over a joinable worker thread -> // std::terminate. dev11 is null iff stop() released the GPU side. if (impl_->dev11) return true; return start(impl_->dev12); } uint32_t DuplicationSource::width() const { return impl_->w; } uint32_t DuplicationSource::height() const { return impl_->h; } uint32_t DuplicationSource::mipLevels() const { return impl_->mips; } uint32_t DuplicationSource::textureCount() const { return kBuffers; } ID3D12Resource* DuplicationSource::texture(uint32_t i) const { return i < kBuffers ? 
impl_->tex12[i].Get() : nullptr; } int DuplicationSource::latest(uint32_t* gen) const { std::lock_guard lk(impl_->mu); if (gen) *gen = impl_->generation; return impl_->published; } DuplicationSource::Stats DuplicationSource::stats() const { std::lock_guard lk(impl_->mu); Stats s; s.frames = impl_->frames; s.reacquires = impl_->reacquires; s.capturing = impl_->capturing; return s; } } // namespace sauna