[英]Get the number of logical CPU cores sharing a cache (L1, L2, L3)
下面是一些使用GetLogicalProcessorInformation 檢測Windows 上 L1、L2 和 L3 CPU 緩存大小的 C++ 代碼:
typedef BOOL (WINAPI *LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI) GetProcAddress(
GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi)
{
DWORD bytes = 0;
glpi(0, &bytes);
size_t size = bytes / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
vector<SYSTEM_LOGICAL_PROCESSOR_INFORMATION> info(size);
glpi(info.data(), &bytes);
for (size_t i = 0; i < size; i++)
{
if (info[i].Relationship == RelationCache)
{
if (info[i].Cache.Level == 1)
l1_cache_Size = info[i].Cache.Size;
if (info[i].Cache.Level == 2)
l2_cache_Size = info[i].Cache.Size;
if (info[i].Cache.Level == 3)
l3_cache_Size = info[i].Cache.Size;
}
}
}
作為下一步,我想獲得共享緩存的邏輯 CPU 內核的數量。 在具有超線程的 x64 CPU 上,兩個邏輯 CPU 內核通常共享 L2 緩存,所有邏輯 CPU 內核共享 L3 緩存。
通讀 MSDN 后,我認為 GetLogicalProcessorInformationEx 以及 CACHE_RELATIONSHIP 和 GROUP_AFFINITY 是我正在尋找的數據結構,但在嘗試之后,這些數據結構對我的目的似乎毫無用處。
問題:
有沒有辦法使用 C/C++ 在 Windows 上獲取共享緩存的邏輯 CPU 內核的數量?(理想情況下不直接使用 cpuid)
解決方法:
可以使用 GetLogicalProcessorInformationEx 以及 CACHE_RELATIONSHIP 和 GROUP_AFFINITY 數據結構獲取共享緩存的邏輯 CPU 內核的數量。在 GROUP_AFFINITY.Mask 的值中,共享當前緩存(RelationCache)的每個邏輯 CPU 內核各對應一個被設置的位。以大多數具有超線程的 Intel CPU 為例:在具有 4 個物理 CPU 內核和 8 個邏輯 CPU 內核的 CPU 上,L2 緩存的 GROUP_AFFINITY.Mask 會有 2 位被設置,L3 緩存的 GROUP_AFFINITY.Mask 會有 8 位被設置。
這是C++代碼:
#include <windows.h>
#include <vector>
#include <iostream>
using namespace std;
typedef BOOL (WINAPI *LPFN_GLPI)(LOGICAL_PROCESSOR_RELATIONSHIP,
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
int main()
{
LPFN_GLPI glpi = (LPFN_GLPI) GetProcAddress(
GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
if (!glpi)
return 1;
DWORD bytes = 0;
glpi(RelationAll, 0, &bytes);
vector<char> buffer(bytes);
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info;
if (!glpi(RelationAll, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) &buffer[0], &bytes))
return 1;
for (size_t i = 0; i < bytes; i += info->Size)
{
info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*) &buffer[i];
if (info->Relationship == RelationCache &&
(info->Cache.Type == CacheData ||
info->Cache.Type == CacheUnified))
{
cout << "info->Cache.Level: " << (int) info->Cache.Level << endl;
cout << "info->Cache.CacheSize: " << (int) info->Cache.CacheSize << endl;
cout << "info->Cache.GroupMask.Group: " << info->Cache.GroupMask.Group << endl;
cout << "info->Cache.GroupMask.Mask: " << info->Cache.GroupMask.Mask << endl << endl;
}
}
return 0;
}
注意事項:
我發現在虛擬機中運行 Windows 時,上面的代碼無法正確檢測共享緩存的 CPU 內核數。例如在具有 2 個虛擬 CPU 內核的 VM 上,上面的代碼報告每個邏輯 CPU 內核都有私有的 L1、L2 和 L3 緩存。
@RbMm:但 CACHE_RELATIONSHIP 包含所需的所有信息。 邏輯 CPU 核心數 = Cache->GroupMask.Mask 中設置的位數
我已經在 AppVeyor CI 上測試過這個(甚至在發布到 stackoverflow 之前)。 這是 x64 CPU 的輸出:
info->Cache.Level: 1
info->Cache.CacheSize: 32768
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 1
info->Cache.Level: 1
info->Cache.CacheSize: 32768
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 1
info->Cache.Level: 2
info->Cache.CacheSize: 262144
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 1
info->Cache.Level: 3
info->Cache.CacheSize: 31457280
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 1
info->Cache.Level: 1
info->Cache.CacheSize: 32768
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 2
info->Cache.Level: 1
info->Cache.CacheSize: 32768
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 2
info->Cache.Level: 2
info->Cache.CacheSize: 262144
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 2
info->Cache.Level: 3
info->Cache.CacheSize: 31457280
info->Cache.GroupMask.Group: 0
info->Cache.GroupMask.Mask: 2
或者:
| Cache Level | Processor 1 | Processor 2 |
|-------------|--------------------|--------------------|
| L1 | 32 KB Data | 32 KB Data |
| | 32 KB Instruction | 32 KB Instruction |
|-------------|--------------------|--------------------|
| L2 | 256 KB Unified | 256 KB Unified |
|-------------|--------------------|--------------------|
| L3 | 30 MB Unified | 30 MB Unified |
根據 MSDN 文檔:
GroupMask.Mask - 一個位圖,指定指定組內零個或多個處理器的關聯。
基於此文檔,我期望 L3 緩存使用不同的 GroupMask.Mask,但上面的輸出未顯示這一點。對我來說,GroupMask.Mask 的數據毫無意義!
結果可能取決於特定的處理器和 Windows 版本,但我在 2 個處理器上測試 (win10) 並得到正確的結果:
i5(2 核,4 線程):
ProcessorPackage
[G0 000000000000000F { #3, #2, #1, #0}]
ProcessorCore HP=1 0
[G0 0000000000000003 { #1, #0}]
Cache L1 8000 40 [G0 0000000000000003 { #1, #0}] A=8 Data
Cache L1 8000 40 [G0 0000000000000003 { #1, #0}] A=8 Instruction
Cache L2 40000 40 [G0 0000000000000003 { #1, #0}] A=8 Unified
Cache L3 300000 40 [G0 000000000000000F { #3, #2, #1, #0}] A=c Unified
ProcessorCore HP=1 0
[G0 000000000000000C { #3, #2}]
Cache L1 8000 40 [G0 000000000000000C { #3, #2}] A=8 Data
Cache L1 8000 40 [G0 000000000000000C { #3, #2}] A=8 Instruction
Cache L2 40000 40 [G0 000000000000000C { #3, #2}] A=8 Unified
NumaNode #0 [G0 000000000000000F { #3, #2, #1, #0}]
Group:1/1
4/4 [000000000000000F { #3, #2, #1, #0}]
i7(4 核,8 線程):
ProcessorPackage
[G0 00000000000000FF { #7, #6, #5, #4, #3, #2, #1, #0}]
ProcessorCore HP=1 0
[G0 0000000000000003 { #1, #0}]
Cache L1 8000 40 [G0 0000000000000003 { #1, #0}] A=8 Data
Cache L1 8000 40 [G0 0000000000000003 { #1, #0}] A=8 Instruction
Cache L2 40000 40 [G0 0000000000000003 { #1, #0}] A=4 Unified
Cache L3 800000 40 [G0 00000000000000FF { #7, #6, #5, #4, #3, #2, #1, #0}] A=10 Unified
ProcessorCore HP=1 0
[G0 000000000000000C { #3, #2}]
Cache L1 8000 40 [G0 000000000000000C { #3, #2}] A=8 Data
Cache L1 8000 40 [G0 000000000000000C { #3, #2}] A=8 Instruction
Cache L2 40000 40 [G0 000000000000000C { #3, #2}] A=4 Unified
ProcessorCore HP=1 0
[G0 0000000000000030 { #5, #4}]
Cache L1 8000 40 [G0 0000000000000030 { #5, #4}] A=8 Data
Cache L1 8000 40 [G0 0000000000000030 { #5, #4}] A=8 Instruction
Cache L2 40000 40 [G0 0000000000000030 { #5, #4}] A=4 Unified
ProcessorCore HP=1 0
[G0 00000000000000C0 { #7, #6}]
Cache L1 8000 40 [G0 00000000000000C0 { #7, #6}] A=8 Data
Cache L1 8000 40 [G0 00000000000000C0 { #7, #6}] A=8 Instruction
Cache L2 40000 40 [G0 00000000000000C0 { #7, #6}] A=4 Unified
NumaNode #0 [G0 00000000000000FF { #7, #6, #5, #4, #3, #2, #1, #0}]
Group:1/1
8/8 [00000000000000FF { #7, #6, #5, #4, #3, #2, #1, #0}]
代碼:
/// Formats an affinity mask as text: the mask printed with "%p", followed by
/// the indices of all set bits from highest to lowest, e.g.
/// "000000000000000F { #3, #2, #1, #0}". A mask with no bits set yields "{}".
///
/// @param Mask  Affinity bitmap; bit N set means processor N is included.
/// @param sz    Destination buffer, NUL-terminated on return. No length is
///              passed, so the caller must supply enough room; the caller in
///              this file reserves 64 * 5 + 19 bytes (up to 5 characters per
///              bit for 64 bits, plus the prefix and terminator).
void FormatMask(KAFFINITY Mask, PSTR sz)
{
    sz += sprintf(sz, "%p {", reinterpret_cast<PVOID>(Mask));

    bool anyBit = false;
    ULONG i = sizeof(KAFFINITY) * 8;
    do
    {
        --i;
        // Plain shift-and-mask on the full-width value instead of _bittest
        // through a PLONG cast of a pointer-sized integer.
        if ((Mask >> i) & 1)
        {
            sz += sprintf(sz, " #%u,", static_cast<unsigned>(i));
            anyBit = true;
        }
    } while (i);

    if (anyBit)
    {
        // Replace the trailing ',' of the last entry with the closing brace.
        *--sz = '}';
    }
    else
    {
        // Nothing was listed: append the brace rather than overwriting the
        // '{' that precedes the terminator.
        *sz++ = '}';
        *sz = '\0';
    }
}
/// Dumps the processor topology returned by
/// GetLogicalProcessorInformationEx(RelationAll) through DbgPrint: packages,
/// cores, caches, NUMA nodes and processor groups, one line per record plus
/// one line per group mask.
///
/// The result buffer is taken from the stack with alloca and enlarged while
/// the API keeps failing with ERROR_INSUFFICIENT_BUFFER; any other failure
/// ends the function without output.
void DumpCpuInfo()
{
    // Printable names indexed by Cache->Type. The index is wrapped with '%'
    // below so an unexpected value cannot read past the table.
    static PCSTR szCacheType[] = {
        "Unified",
        "Instruction",
        "Data",
        "Trace"
    };

    // Scratch text buffer handed to FormatMask for every mask printed.
    char szMask[64 * 5 + 19];

    // cb  - bytes currently reserved for the result buffer.
    // rcb - in: buffer size passed to the API; out: bytes returned/required.
    ULONG cb = 0, rcb = 0x400;

    // NOTE(review): guz is a zero-initialised volatile, so this looks like
    // alloca(0) used only to record the current stack position in a way the
    // compiler cannot fold away - confirm.
    static volatile UCHAR guz;
    PVOID stack = alloca(guz);

    union {
        PVOID Buffer;
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX plpi;
    };

    do
    {
        // Reserve the missing (rcb - cb) bytes and recompute the buffer size
        // as the distance between the new block and `stack`.
        // NOTE(review): this presumes successive alloca blocks are contiguous
        // and end at `stack` - confirm for the target compiler.
        if (cb < rcb) rcb = cb = RtlPointerToOffset(Buffer = alloca(rcb - cb), stack);

        if (GetLogicalProcessorInformationEx(::RelationAll, plpi, &rcb))
        {
            // Walk the variable-length records until all rcb bytes are used.
            DWORD Size;
            do
            {
                Size = plpi->Size;

                // Typed views of the record payload; which one is valid is
                // decided by plpi->Relationship.
                union {
                    PPROCESSOR_RELATIONSHIP Processor;
                    PNUMA_NODE_RELATIONSHIP NumaNode;
                    PCACHE_RELATIONSHIP Cache;
                    PGROUP_RELATIONSHIP Group;
                    PVOID pv;
                };

                pv = &plpi->Processor;

                switch (plpi->Relationship)
                {
                case RelationProcessorPackage:
                    DbgPrint("ProcessorPackage\n");
                    // Packages and cores list their group masks the same way.
                    goto print_group_masks;

                case RelationProcessorCore:
                    DbgPrint("ProcessorCore HP=%x %x\n",
                        Processor->Flags & LTP_PC_SMT ? 1 : 0, Processor->EfficiencyClass);
                print_group_masks:
                    if (WORD GroupCount = Processor->GroupCount)
                    {
                        PGROUP_AFFINITY GroupMask = Processor->GroupMask;
                        do
                        {
                            FormatMask(GroupMask->Mask, szMask);
                            DbgPrint("\t[G%u %s]\n", GroupMask->Group, szMask);
                        } while (GroupMask++, --GroupCount);
                    }
                    break;

                case RelationNumaNode:
                    FormatMask(NumaNode->GroupMask.Mask, szMask);
                    DbgPrint("NumaNode #%u [G%u %s]\n",
                        NumaNode->NodeNumber, NumaNode->GroupMask.Group, szMask);
                    break;

                case RelationGroup:
                    DbgPrint("Group:%u/%u\n", Group->ActiveGroupCount, Group->MaximumGroupCount);
                    if (WORD ActiveGroupCount = Group->ActiveGroupCount)
                    {
                        PPROCESSOR_GROUP_INFO GroupInfo = Group->GroupInfo;
                        do
                        {
                            FormatMask(GroupInfo->ActiveProcessorMask, szMask);
                            DbgPrint("\t%u/%u [%s]\n",
                                GroupInfo->ActiveProcessorCount,
                                GroupInfo->MaximumProcessorCount, szMask);
                            // Advance to the next group entry; without the
                            // increment every iteration would print the first
                            // group's data again.
                        } while (GroupInfo++, --ActiveGroupCount);
                    }
                    break;

                case RelationCache:
                    FormatMask(Cache->GroupMask.Mask, szMask);
                    DbgPrint("Cache L%u %8x %2x [G%u %s] A=%x %s\n",
                        Cache->Level,
                        Cache->CacheSize, Cache->LineSize,
                        Cache->GroupMask.Group, szMask,
                        Cache->Associativity,
                        szCacheType[Cache->Type % RTL_NUMBER_OF(szCacheType)]
                        );
                    break;
                }

                // Step to the next record.
                Buffer = RtlOffsetToPointer(plpi, Size);

            } while (rcb -= Size);

            break;
        }

        // Retry only when the buffer was too small; rcb now holds the size
        // the API asked for.
    } while (GetLastError() == ERROR_INSUFFICIENT_BUFFER);
}
有一個來自 boost 庫的替代解決方案。
// Logical core count as reported by Boost.Thread.
unsigned logical = boost::thread::hardware_concurrency();
// Physical core count as reported by Boost.Thread.
unsigned physical = boost::thread::physical_concurrency();
但不考慮緩存。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.