Introduction
There is a lot of material out there about screen capture; here is a simple, single-header, hardware-accelerated approach. If you target Windows 8 or later, you can easily include it in your projects.
Requirements
Windows 8 or later (for the DXGI Desktop Duplication API), a Direct3D 11 capable adapter, and Media Foundation for encoding.
Video Capture
We need to enumerate the adapters and their monitors with the aid of DXGI:
static void GetAdapters(std::vector<CComPtr<IDXGIAdapter1>>& a)
{
    CComPtr<IDXGIFactory1> df;
    CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&df);
    a.clear();
    if (!df)
        return;
    int L = 0;
    for (;;)
    {
        CComPtr<IDXGIAdapter1> lDxgiAdapter;
        df->EnumAdapters1(L, &lDxgiAdapter);
        if (!lDxgiAdapter)
            break;
        L++;
        a.push_back(lDxgiAdapter);
    }
}
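A quick usage sketch that lists what GetAdapters found (DXGI_ADAPTER_DESC1 provides the adapter name; the loop below is illustrative, not part of the library):
std::vector<CComPtr<IDXGIAdapter1>> adapters;
GetAdapters(adapters);
for (auto& a : adapters)
{
    DXGI_ADAPTER_DESC1 d = {};
    a->GetDesc1(&d);
    wprintf(L"Adapter: %s\r\n", d.Description);   // pick one to pass to CreateDirect3DDevice below
}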
Then we create a Direct3D 11 device on one of them, or on the default adapter:
HRESULT CreateDirect3DDevice(IDXGIAdapter1* g)
{
    HRESULT hr = S_OK;
    D3D_DRIVER_TYPE DriverTypes[] =
    {
        D3D_DRIVER_TYPE_HARDWARE,
        D3D_DRIVER_TYPE_WARP,
        D3D_DRIVER_TYPE_REFERENCE,
    };
    UINT NumDriverTypes = ARRAYSIZE(DriverTypes);
    D3D_FEATURE_LEVEL FeatureLevels[] =
    {
        D3D_FEATURE_LEVEL_11_0,
        D3D_FEATURE_LEVEL_10_1,
        D3D_FEATURE_LEVEL_10_0,
        D3D_FEATURE_LEVEL_9_3,
        D3D_FEATURE_LEVEL_9_2,
        D3D_FEATURE_LEVEL_9_1
    };
    UINT NumFeatureLevels = ARRAYSIZE(FeatureLevels);
    D3D_FEATURE_LEVEL FeatureLevel;
    for (UINT DriverTypeIndex = 0; DriverTypeIndex < NumDriverTypes; ++DriverTypeIndex)
    {
        // When an explicit adapter is passed, D3D11CreateDevice requires D3D_DRIVER_TYPE_UNKNOWN.
        hr = D3D11CreateDevice(g, g ? D3D_DRIVER_TYPE_UNKNOWN : DriverTypes[DriverTypeIndex],
            nullptr, D3D11_CREATE_DEVICE_VIDEO_SUPPORT, FeatureLevels, NumFeatureLevels,
            D3D11_SDK_VERSION, &device, &FeatureLevel, &context);
        if (SUCCEEDED(hr))
            break;
    }
    return hr;
}
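The snippets in this article refer to a few members of the capture class that are not shown explicitly; a sketch of the assumed declarations (names taken from the code, types inferred):
CComPtr<ID3D11Device> device;               // Direct3D 11 device
CComPtr<ID3D11DeviceContext> context;       // immediate context
CComPtr<IDXGIOutputDuplication> lDeskDupl;  // desktop duplication interface
CComPtr<ID3D11Texture2D> lGDIImage;         // GDI-compatible render target (cursor drawing)
CComPtr<ID3D11Texture2D> lDestImage;        // CPU-readable staging texture
DXGI_OUTDUPL_DESC lOutputDuplDesc = {};     // duplication description (size, format)
std::vector<BYTE> buf;                      // the captured RGBA pixels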
Next, we create the desktop duplication of the selected output:
bool Prepare(UINT Output = 0)
{
    CComPtr<IDXGIDevice> lDxgiDevice;
    lDxgiDevice = device;
    if (!lDxgiDevice)
        return 0;

    CComPtr<IDXGIAdapter> lDxgiAdapter;
    auto hr = lDxgiDevice->GetParent(
        __uuidof(IDXGIAdapter),
        reinterpret_cast<void**>(&lDxgiAdapter));
    if (FAILED(hr))
        return 0;
    lDxgiDevice = 0;

    CComPtr<IDXGIOutput> lDxgiOutput;
    hr = lDxgiAdapter->EnumOutputs(Output, &lDxgiOutput);
    if (FAILED(hr))
        return 0;
    lDxgiAdapter = 0;

    DXGI_OUTPUT_DESC lOutputDesc;
    hr = lDxgiOutput->GetDesc(&lOutputDesc);

    CComPtr<IDXGIOutput1> lDxgiOutput1;
    lDxgiOutput1 = lDxgiOutput;
    if (!lDxgiOutput1)
        return 0;
    lDxgiOutput = 0;

    hr = lDxgiOutput1->DuplicateOutput(device, &lDeskDupl);
    if (FAILED(hr))
        return 0;
    lDxgiOutput1 = 0;
    lDeskDupl->GetDesc(&lOutputDuplDesc);

    // GDI-compatible render target, used later to draw the cursor via GetDC().
    D3D11_TEXTURE2D_DESC desc = {};
    desc.Width = lOutputDuplDesc.ModeDesc.Width;
    desc.Height = lOutputDuplDesc.ModeDesc.Height;
    desc.Format = lOutputDuplDesc.ModeDesc.Format;
    desc.ArraySize = 1;
    desc.BindFlags = D3D11_BIND_FLAG::D3D11_BIND_RENDER_TARGET;
    desc.MiscFlags = D3D11_RESOURCE_MISC_GDI_COMPATIBLE;
    desc.SampleDesc.Count = 1;
    desc.SampleDesc.Quality = 0;
    desc.MipLevels = 1;
    desc.CPUAccessFlags = 0;
    desc.Usage = D3D11_USAGE_DEFAULT;
    hr = device->CreateTexture2D(&desc, NULL, &lGDIImage);
    if (FAILED(hr))
        return 0;
    if (lGDIImage == nullptr)
        return 0;

    // CPU-readable staging texture, mapped later to read the pixels.
    desc.Width = lOutputDuplDesc.ModeDesc.Width;
    desc.Height = lOutputDuplDesc.ModeDesc.Height;
    desc.Format = lOutputDuplDesc.ModeDesc.Format;
    desc.ArraySize = 1;
    desc.BindFlags = 0;
    desc.MiscFlags = 0;
    desc.SampleDesc.Count = 1;
    desc.SampleDesc.Quality = 0;
    desc.MipLevels = 1;
    desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
    desc.Usage = D3D11_USAGE_STAGING;
    hr = device->CreateTexture2D(&desc, NULL, &lDestImage);
    if (FAILED(hr))
        return 0;
    if (lDestImage == nullptr)
        return 0;

    return 1;
}
To get the screenshot, we loop:
hr = cap.lDeskDupl->AcquireNextFrame(
    0,
    &lFrameInfo,
    &lDesktopResource);
if (hr == DXGI_ERROR_WAIT_TIMEOUT)
    hr = S_OK;
if (FAILED(hr))
    break;
if (lDesktopResource && !cap.Get(lDesktopResource, dp.Cursor,
    dp.rx.right && dp.rx.bottom ? &dp.rx : 0))
    break;
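Note that, in the real loop, a successfully acquired frame must also be returned with ReleaseFrame() once the copy is done; a hedged sketch of a fuller iteration (same cap and dp names as above, error handling trimmed):
DXGI_OUTDUPL_FRAME_INFO lFrameInfo = {};
CComPtr<IDXGIResource> lDesktopResource;
auto hr = cap.lDeskDupl->AcquireNextFrame(0, &lFrameInfo, &lDesktopResource);
if (SUCCEEDED(hr) && lDesktopResource)
{
    bool ok = cap.Get(lDesktopResource, dp.Cursor,
        dp.rx.right && dp.rx.bottom ? &dp.rx : 0);
    cap.lDeskDupl->ReleaseFrame();   // give the frame back before the next acquisition
    // ... encode/consume the captured buffer here ...
}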
The Get() method returns the bitmap, optionally with the cursor drawn and cropped to a rectangle:
bool Get(IDXGIResource* lDesktopResource, bool Curs, RECT* rcx = 0)
{
    CComPtr<ID3D11Texture2D> lAcquiredDesktopImage;
    if (!lDesktopResource)
        return 0;
    auto hr = lDesktopResource->QueryInterface(IID_PPV_ARGS(&lAcquiredDesktopImage));
    if (!lAcquiredDesktopImage)
        return 0;
    lDesktopResource = 0;

    // Copy the acquired desktop image into the GDI-compatible texture.
    context->CopyResource(lGDIImage, lAcquiredDesktopImage);

    CComPtr<IDXGISurface1> lIDXGISurface1;
    lIDXGISurface1 = lGDIImage;
    if (!lIDXGISurface1)
        return 0;

    // Optionally draw the cursor with GDI.
    CURSORINFO lCursorInfo = { 0 };
    lCursorInfo.cbSize = sizeof(lCursorInfo);
    auto lBoolres = GetCursorInfo(&lCursorInfo);
    if (lBoolres == TRUE)
    {
        if (lCursorInfo.flags == CURSOR_SHOWING && Curs)
        {
            auto lCursorPosition = lCursorInfo.ptScreenPos;
            HDC lHDC;
            lIDXGISurface1->GetDC(FALSE, &lHDC);
            DrawIconEx(
                lHDC,
                lCursorPosition.x,
                lCursorPosition.y,
                lCursorInfo.hCursor,
                0, 0, 0, 0,
                DI_NORMAL | DI_DEFAULTSIZE);
            lIDXGISurface1->ReleaseDC(nullptr);
        }
    }

    // Copy to the staging texture and map it so the CPU can read the pixels.
    context->CopyResource(lDestImage, lGDIImage);
    D3D11_MAPPED_SUBRESOURCE resource;
    UINT subresource = D3D11CalcSubresource(0, 0, 0);
    hr = context->Map(lDestImage, subresource, D3D11_MAP_READ_WRITE, 0, &resource);
    if (FAILED(hr))
        return 0;

    auto sz = lOutputDuplDesc.ModeDesc.Width
        * lOutputDuplDesc.ModeDesc.Height * 4;
    auto sz2 = sz;
    buf.resize(sz);
    if (rcx)
    {
        sz2 = (rcx->right - rcx->left) * (rcx->bottom - rcx->top) * 4;
        buf.resize(sz2);
        sz = sz2;
    }

    UINT lBmpRowPitch = lOutputDuplDesc.ModeDesc.Width * 4;
    if (rcx)
        lBmpRowPitch = (rcx->right - rcx->left) * 4;
    UINT lRowPitch = std::min<UINT>(lBmpRowPitch, resource.RowPitch);

    // Copy row by row, bottom-up (the buffer is built upside down), honoring the optional crop rectangle.
    BYTE* sptr = reinterpret_cast<BYTE*>(resource.pData);
    BYTE* dptr = buf.data() + sz - lBmpRowPitch;
    if (rcx)
        sptr += rcx->left * 4;
    for (size_t h = 0; h < lOutputDuplDesc.ModeDesc.Height; ++h)
    {
        if (rcx && h < (size_t)rcx->top)
        {
            sptr += resource.RowPitch;
            continue;
        }
        if (rcx && h >= (size_t)rcx->bottom)
            break;
        memcpy_s(dptr, lBmpRowPitch, sptr, lRowPitch);
        sptr += resource.RowPitch;
        dptr -= lBmpRowPitch;
    }

    context->Unmap(lDestImage, subresource);
    return 1;
}
After that, you can feed the "buf" data into Media Foundation's sink writer.
Audio Capture
You use an IAudioClient to obtain an IAudioCaptureClient, then record the audio in a separate thread:
void ThreadLoopCapture()
{
    UINT64 up, uq;
    while (Capturing)
    {
        if (hEv)
            WaitForSingleObject(hEv, INFINITE);
        if (!Capturing)
            break;
        auto hr = cap->GetBuffer(&pData, &framesAvailable, &flags, &up, &uq);
        if (FAILED(hr))
            break;
        if (framesAvailable == 0)
            continue;
        auto ThisAudioBytes = framesAvailable * wfx.Format.nChannels *
            wfx.Format.wBitsPerSample / 8;
        AudioDataX->PushX((const char*)pData, ThisAudioBytes);
        cap->ReleaseBuffer(framesAvailable);
    }
    CapturingFin1 = true;
}
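For context, a hedged sketch of the WASAPI setup that would produce the cap, hEv, and wfx members used above (default capture endpoint, shared mode, event driven; error handling and cleanup omitted):
#include <mmdeviceapi.h>
#include <audioclient.h>

CComPtr<IMMDeviceEnumerator> en;
CoCreateInstance(__uuidof(MMDeviceEnumerator), 0, CLSCTX_ALL, IID_PPV_ARGS(&en));
CComPtr<IMMDevice> dev;
en->GetDefaultAudioEndpoint(eCapture, eConsole, &dev);          // eRender for a loopback source
CComPtr<IAudioClient> ac;
dev->Activate(__uuidof(IAudioClient), CLSCTX_ALL, 0, (void**)&ac);
WAVEFORMATEX* pwfx = 0;
ac->GetMixFormat(&pwfx);                                        // this becomes 'wfx'
HANDLE hEv = CreateEvent(0, FALSE, 0, 0);
ac->Initialize(AUDCLNT_SHAREMODE_SHARED,
    AUDCLNT_STREAMFLAGS_EVENTCALLBACK,                          // add AUDCLNT_STREAMFLAGS_LOOPBACK for playback devices
    10000000, 0, pwfx, 0);                                      // 1-second buffer, in 100-ns units
ac->SetEventHandle(hEv);
CComPtr<IAudioCaptureClient> cap;
ac->GetService(__uuidof(IAudioCaptureClient), (void**)&cap);
ac->Start();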
If the recording device is a playback device captured through loopback, you have to ensure that something is actually playing; otherwise, the Core Audio API delivers no data. So we play silence:
void PlaySilence(REFERENCE_TIME rt)
{
    rt /= 10000;
    auto ns = (wfx.Format.nSamplesPerSec * rt);
    ns /= 1000;
    while (Capturing)
    {
        if (!ren)
            break;
        Sleep((DWORD)(rt / 2));
        if (!Capturing)
            break;
        UINT32 numFramesPadding = 0;
        auto hr = ac2->GetCurrentPadding(&numFramesPadding);
        if (FAILED(hr))
            break;
        auto numFramesAvailable = ns - numFramesPadding;
        if (!numFramesAvailable)
            continue;
        BYTE* db = 0;
        hr = ren->GetBuffer((UINT32)numFramesAvailable, &db);
        if (FAILED(hr))
            break;
        auto bs = numFramesAvailable * wfx.Format.nChannels * wfx.Format.wBitsPerSample / 8;
        memset(db, 0, (size_t)bs);
        ren->ReleaseBuffer((UINT32)numFramesAvailable, 0);
    }
    CapturingFin2 = true;
}
When there are multiple audio streams, you have to mix them into a single buffer. This is done with my own REBUFFER and MIXBUFFER structures:
struct REBUFFER
{
    std::recursive_mutex m;
    std::vector<char> d;
    AHANDLE Has = CreateEvent(0, TRUE, 0, 0);
    MIXBUFFER<float> mb;

    void FinMix(size_t sz, float* A = 0)
    {
        mb.Fin(sz / sizeof(float), A);
    }

    size_t PushX(const char* dd, size_t sz, float* A = 0, float V = 1.0f)
    {
        REBUFFERLOCK l(m);
        auto s = d.size();
        d.resize(s + sz);
        if (dd)
            memcpy(d.data() + s, dd, sz);
        else
            memset(d.data() + s, 0, sz);
        char* a1 = d.data();
        a1 += s;
        mb.Set((float*)a1);
        mb.count = 1;
        SetEvent(Has);
        float* b = (float*)(d.data() + s);
        if (V > 1.01f || V < 0.99f)
        {
            auto st = sz / sizeof(float);
            for (size_t i = 0; i < st; i++)
                b[i] *= V;
        }
        if (A)
        {
            *A = Peak<float>(b, sz / sizeof(float));
        }
        return s + sz;
    }

    size_t Av()
    {
        REBUFFERLOCK l(m);
        return d.size();
    }

    size_t PopX(char* trg, size_t sz, DWORD wi = 0, bool NR = false)
    {
        if (wi)
            WaitForSingleObject(Has, wi);
        REBUFFERLOCK l(m);
        if (sz >= d.size())
            sz = d.size();
        if (sz == 0)
            return 0;
        if (trg)
            memcpy(trg, d.data(), sz);
        if (NR == false)
            d.erase(d.begin(), d.begin() + sz);
        if (d.size() == 0)
            ResetEvent(Has);
        return sz;
    }

    void Clear()
    {
        REBUFFERLOCK l(m);
        d.clear();
    }
};
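For intuition, mixing several float streams essentially means summing the samples; a minimal, standalone sketch of the idea (this is not the library's MIXBUFFER implementation):
#include <vector>
#include <algorithm>

// Sum equal-length float PCM streams into 'out' and clamp to full scale.
void MixFloatStreams(const std::vector<std::vector<float>>& in, std::vector<float>& out)
{
    if (in.empty())
        return;
    out.assign(in[0].size(), 0.0f);
    for (auto& s : in)
        for (size_t i = 0; i < out.size() && i < s.size(); i++)
            out[i] += s[i];
    for (auto& v : out)
        v = std::clamp(v, -1.0f, 1.0f);   // avoid exceeding [-1, 1]
}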
If you have audio, video is synchronized to it.
HDR Support
When your display is in HDR mode, the following happens:
- lDeskDupl->GetDesc(&lOutputDuplDesc); returns a description with a format of DXGI_FORMAT_R16G16B16A16_FLOAT, which is not GDI compatible.
- The cursor can't be drawn, hence the Cursor parameter is ignored.
- Media Foundation can't create an HDR video. Therefore, you have to install Turbo Play and use my own Media Foundation library, which can use the NVIDIA encoder to create true HDR10 videos. If you install Turbo Play, there is a file nvh64.dll in the installation directory. Run regsvr32 on it as administrator and my filter will be registered, ready to be used by the screen capture.
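In code, the HDR case can be detected from the duplication description (a sketch using the members shown earlier):
// After DuplicateOutput() and GetDesc() in Prepare():
bool IsHDR = (lOutputDuplDesc.ModeDesc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT);
if (IsHDR)
{
    // 8 bytes per pixel (4 x 16-bit float); no GDI interop, so cursor drawing is skipped.
}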
Using the Library
#include "stdafx.h"
#include "capture.hpp"
#include <iostream>
int wmain()
{
CoInitializeEx(0, COINIT_APARTMENTTHREADED);
MFStartup(MF_VERSION);
std::cout << "Capturing screen for 10 seconds...";
DESKTOPCAPTUREPARAMS dp;
dp.f = L"capture.mp4";
dp.EndMS = 10000;
DesktopCapture(dp);
std::cout << "Done.\r\n";
return 0;
}
Here, DESKTOPCAPTUREPARAMS is defined as follows:
struct DESKTOPCAPTUREPARAMS
{
    bool HasVideo = 1;
    bool HasAudio = 1;
    std::vector<std::tuple<std::wstring, std::vector<int>>> AudioFrom;
    GUID VIDEO_ENCODING_FORMAT = MFVideoFormat_H264;
    GUID AUDIO_ENCODING_FORMAT = MFAudioFormat_MP3;
    std::wstring f;
    void* cb = 0;
    std::function<HRESULT(const BYTE* d, size_t sz, void* cb)> Streamer;
    std::function<HRESULT(const BYTE* d, size_t sz, void* cb)> Framer;
    std::function<void(IMFAttributes* a)> PrepareAttributes;
    int fps = 25;
    int NumThreads = 0;
    int Qu = -1;
    int vbrm = 0;
    int vbrq = 0;
    int BR = 4000;
    int NCH = 2;
    int SR = 44100;
    int ABR = 192;
    bool Cursor = true;
    RECT rx = { 0,0,0,0 };
    HWND hWnd = 0;
    IDXGIAdapter1* ad = 0;
    UINT nOutput = 0;
    unsigned long long StartMS = 0;
    unsigned long long EndMS = 0;
    bool MustEnd = false;
    bool Pause = false;
};
Where:
- HasVideo = 1 -> You are capturing video. If this is set, the output file must be an MP4 or an ASF, regardless of whether you have audio or not.
- HasAudio = 1 -> You are capturing audio. If this is set and you have no video, the output file must be an MP3 or FLAC.
- AudioFrom -> A vector of the audio devices you want to capture from. Each element is a tuple of the device's unique ID (as returned by the enumeration, see VISTAMIXERS::EnumVistaMixers()) and a vector of the channels you want to record from. The library can also record from a playback device (like your speakers) in loopback. You can specify multiple recording sources and the library will mix them all into the final audio stream.
- VIDEO_ENCODING_FORMAT -> One of MFVideoFormat_H264, MFVideoFormat_HEVC, MFVideoFormat_VP90, MFVideoFormat_VP80. Use HEVC for HDR.
- AUDIO_ENCODING_FORMAT -> One of MFAudioFormat_MP3, MFAudioFormat_FLAC, or MFAudioFormat_AAC. MP3 and AAC support only 44100/48000 Hz, 2-channel output.
- f -> Target file name (MP3/FLAC for audio only, MP4/ASF otherwise).
- fps -> Frames per second.
- NumThreads -> Threads for the video encoder, 0 for the default. Can be 0-16.
- Qu -> If between 0 and 100, the quality-vs-speed video factor.
- vbrm and vbrq -> If vbrm is 2, then vbrq is a quality value between 0 and 100 (BR is ignored).
- BR -> Video bitrate in KBps, default 4000. Ignored if vbrm is 2.
- NCH -> Audio output channels.
- SR -> Audio output sample rate.
- ABR -> Audio bitrate in Kbps for MP3.
- Cursor -> true to capture the cursor. Ignored in HDR mode.
- rx -> If not {0}, capture only this specific rectangle.
- hWnd -> If not 0, capture only this HWND. If hWnd is 0 and rx = {0}, the entire screen is captured.
- ad -> If not 0, specifies which adapter to capture when you have more than one.
- nOutput -> The index of the monitor to capture; 0 is the first monitor.
- EndMS -> If not 0, the library stops after EndMS milliseconds have been captured. Otherwise, you stop the capture by setting MustEnd to true.
- MustEnd -> Set to true for the library to stop capturing.
- Pause -> If true, capture is paused.
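For example, a hedged sketch that captures a 1920x1080 region of the second monitor for 30 seconds at 30 fps with HEVC (field meanings per the list above):
DESKTOPCAPTUREPARAMS dp;
dp.f = L"region.mp4";
dp.VIDEO_ENCODING_FORMAT = MFVideoFormat_HEVC;
dp.fps = 30;
dp.nOutput = 1;                   // second monitor
dp.rx = { 0, 0, 1920, 1080 };     // capture only this rectangle
dp.EndMS = 30000;                 // stop after 30 seconds
DesktopCapture(dp);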
If you want to capture to a buffer, leave the "f" parameter empty and use the Streamer parameter. Your callback is called for as long as you return S_OK. If you use an ASF container, you need not do anything else. If you want to use an MP4 stream, you must prepare the streaming sample description (see this post). You can use this to stream your desktop over HTTP.
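A minimal Streamer sketch that appends the produced bytes to a file (the callback signature comes from the struct above; that dp.cb is forwarded as the last callback argument is an assumption):
DESKTOPCAPTUREPARAMS dp;
dp.f.clear();                               // no output file -> stream via the callback
dp.VIDEO_ENCODING_FORMAT = MFVideoFormat_H264;
dp.EndMS = 10000;
FILE* out = nullptr;
_wfopen_s(&out, L"stream.bin", L"wb");
dp.cb = out;                                // assumed to be passed back to the callback
dp.Streamer = [](const BYTE* d, size_t sz, void* cb) -> HRESULT
{
    fwrite(d, 1, sz, (FILE*)cb);            // forward the encoded bytes
    return S_OK;                            // keep streaming while S_OK is returned
};
DesktopCapture(dp);
if (out)
    fclose(out);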
Capturing Frames
Instead of capturing compressed video, you can use the Framer callback. It delivers a raw, upside-down RGBA array at your requested resolution (or a 64-bit half-float array for HDR) for as long as you return S_FALSE. Once you return S_OK, the function returns.
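Correspondingly, a hedged Framer sketch that consumes 100 raw frames and then stops (the raw layout is the upside-down RGBA array described above):
DESKTOPCAPTUREPARAMS dp;
dp.f.clear();
static int frames = 0;
dp.Framer = [](const BYTE* d, size_t sz, void*) -> HRESULT
{
    frames++;                                // d points to one upside-down RGBA frame of sz bytes
    return frames < 100 ? S_FALSE : S_OK;    // S_FALSE keeps capturing, S_OK stops
};
DesktopCapture(dp);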
History
- 20th March, 2024: Added HDR support
- 2nd April, 2021: Capturing to stream, capturing frames
- 18th January, 2020: First release