c++ - 如何将 C++ 写入速度加速到 CrystalDiskMark 测试的速度?
问题描述
现在我在内存中每秒获得大约 3.6GB 的数据,我需要将它们连续写入我的 SSD。我用 CrystalDiskMark 测试了我的 SSD 的写入速度,几乎是每秒 6GB,所以我认为这项工作应该没有那么难。
![我的SSD测试结果][1]:
[1] https://plus.google.com/u/0/photos/photo/106876803948041178149/6649598887699308850?authkey=CNbb5KjF8-jxJQ “测试结果”:
我的电脑是 Windows 10,使用 Visual Studio 2017 社区。
我找到了这个问题并尝试了投票最高的答案。不幸的是,他的option_2的写入速度只有1s/GB左右,远低于CrystalDiskMark测试的速度。然后我尝试了内存映射,这次写入速度变快了,大约 630ms/GB,但还是慢了很多。然后我尝试了多线程内存映射,好像线程数为4的时候,速度大概是350ms/GB,加上线程数,写入速度就没有再上去了。
内存映射代码:
#include <fstream>
#include <chrono>
#include <vector>
#include <cstdint>
#include <numeric>
#include <random>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <thread>
#include <windows.h>
#include <sstream>
// Generate random data
std::vector<int> GenerateData(std::size_t bytes) {
assert(bytes % sizeof(int) == 0);
std::vector<int> data(bytes / sizeof(int));
std::iota(data.begin(), data.end(), 0);
std::shuffle(data.begin(), data.end(), std::mt19937{ std::random_device{}() });
return data;
}
// Memory mapping
int map_write(int* data, int size, int id){
char* name = (char*)malloc(100);
sprintf_s(name, 100, "D:\\data_%d.bin",id);
HANDLE hFile = CreateFile(name, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);//
if (hFile == INVALID_HANDLE_VALUE){
return -1;
}
Sleep(0);
DWORD dwFileSize = size;
char* rname = (char*)malloc(100);
sprintf_s(rname, 100, "data_%d.bin", id);
HANDLE hFileMap = CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, dwFileSize, rname);//create file
if (hFileMap == NULL) {
CloseHandle(hFile);
return -2;
}
PVOID pvFile = MapViewOfFile(hFileMap, FILE_MAP_WRITE, 0, 0, 0);//Acquire the address of file on disk
if (pvFile == NULL) {
CloseHandle(hFileMap);
CloseHandle(hFile);
return -3;
}
PSTR pchAnsi = (PSTR)pvFile;
memcpy(pchAnsi, data, dwFileSize);//memery copy
UnmapViewOfFile(pvFile);
CloseHandle(hFileMap);
CloseHandle(hFile);
return 0;
}
// Multi-thread memory mapping
void Mem2SSD_write(int* data, int size){
int part = size / sizeof(int) / 4;
int index[4];
index[0] = 0;
index[1] = part;
index[2] = part * 2;
index[3] = part * 3;
std::thread ta(map_write, data + index[0], size / 4, 10);
std::thread tb(map_write, data + index[1], size / 4, 11);
std::thread tc(map_write, data + index[2], size / 4, 12);
std::thread td(map_write, data + index[3], size / 4, 13);
ta.join();
tb.join();
tc.join();
td.join();
}
//Test:
int main() {
const std::size_t kB = 1024;
const std::size_t MB = 1024 * kB;
const std::size_t GB = 1024 * MB;
for (int i = 0; i < 10; ++i) {
std::vector<int> data = GenerateData(1 * GB);
auto startTime = std::chrono::high_resolution_clock::now();
Mem2SSD_write(&data[0], 1 * GB);
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
std::cout << "1G writing cost: " << duration << " ms" << std::endl;
}
system("pause");
return 0;
}
所以我想问一下,C++写大文件有没有更快的写法?或者,为什么我不能像 CrystalDiskMark 测试的那样快速写入?CrystalDiskMark 是怎么写的?
任何帮助将不胜感激。谢谢!
解决方案
首先,这不是c++问题,而是 os 相关问题。为了获得最大性能,需要使用操作系统特定的低级 api 调用,这在一般的c++库中不存在。从您的代码中可以清楚地看到您使用的是 windows api,因此搜索 windows 的解决方案有多少。
从CreateFileW
功能:
当
FILE_FLAG_NO_BUFFERING
与 结合使用时FILE_FLAG_OVERLAPPED
,这些标志可提供最大的异步性能,因为 I/O 不依赖于内存管理器的同步操作。
CreateFileW
所以我们需要在通话或FILE_NO_INTERMEDIATE_BUFFERING
通话中使用这两个标志的组合NtCreateFile
还扩展文件大小和有效数据长度需要一些时间,所以如果知道开始时的最终文件会更好 - 只需通过NtSetInformationFile
withFileEndOfFileInformation
或 via SetFileInformationByHandle
with设置文件最终大小FileEndOfFileInfo
。然后使用SetFileValidData
或通过NtSetInformationFile
使用FileValidDataLengthInformation设置有效数据长度。设置有效数据长度需要SE_MANAGE_VOLUME_NAME
在最初打开文件时启用特权(但不是在调用时SetFileValidData
)
还要寻找文件压缩 - 如果文件压缩(如果在压缩文件夹中创建,默认情况下将被压缩)这是非常慢的写入。所以需要通过禁用文件压缩FSCTL_SET_COMPRESSION
那么当我们使用异步 I/O(最快的方式)时,我们不需要创建多个专用线程。相反,我们需要确定并发运行的 I/O 请求数。如果您使用CrystalDiskMark,它实际上会运行CdmResource\diskspd\diskspd64.exe进行测试,这是与其-o<count>
参数对应的(运行diskspd64.exe /? > h.txt
查看参数列表)。
使用非缓冲 I/O使任务更加困难,因为存在 3 个附加要求:
- 传递给 WriteFile 的任何 ByteOffset 必须是扇区大小的倍数。
- 传递给 WriteFile 的长度必须是扇区大小的整数
- 缓冲区必须按照底层设备的对齐要求进行对齐。要获取此信息,请
NtQueryInformationFile
使用FileAlignmentInformation
或GetFileInformationByHandleEx
使用FileAlignmentInfo调用
在大多数情况下,页面对齐的内存也会是扇区对齐的,因为扇区大小大于页面大小的情况很少见。
所以几乎总是使用 VirtualAlloc 函数和多页大小(4,096 字节)分配的缓冲区是可以的。在针对较小代码大小的具体测试中,我使用了这个假设
struct WriteTest
{
enum { opCompression, opWrite };
struct REQUEST : IO_STATUS_BLOCK
{
WriteTest* pTest;
ULONG opcode;
ULONG offset;
};
LONGLONG _TotalSize, _BytesLeft;
HANDLE _hFile;
ULONG64 _StartTime;
void* _pData;
REQUEST* _pRequests;
ULONG _BlockSize;
ULONG _ConcurrentRequestCount;
ULONG _dwThreadId;
LONG _dwRefCount;
WriteTest(ULONG BlockSize, ULONG ConcurrentRequestCount)
{
if (BlockSize & (BlockSize - 1))
{
__debugbreak();
}
_BlockSize = BlockSize, _ConcurrentRequestCount = ConcurrentRequestCount;
_dwRefCount = 1, _hFile = 0, _pRequests = 0, _pData = 0;
_dwThreadId = GetCurrentThreadId();
}
~WriteTest()
{
if (_pData)
{
VirtualFree(_pData, 0, MEM_RELEASE);
}
if (_pRequests)
{
delete [] _pRequests;
}
if (_hFile)
{
NtClose(_hFile);
}
PostThreadMessageW(_dwThreadId, WM_QUIT, 0, 0);
}
void Release()
{
if (!InterlockedDecrement(&_dwRefCount))
{
delete this;
}
}
void AddRef()
{
InterlockedIncrementNoFence(&_dwRefCount);
}
void StartWrite()
{
IO_STATUS_BLOCK iosb;
FILE_VALID_DATA_LENGTH_INFORMATION fvdl;
fvdl.ValidDataLength.QuadPart = _TotalSize;
NTSTATUS status;
if (0 > (status = NtSetInformationFile(_hFile, &iosb, &_TotalSize, sizeof(_TotalSize), FileEndOfFileInformation)) ||
0 > (status = NtSetInformationFile(_hFile, &iosb, &fvdl, sizeof(fvdl), FileValidDataLengthInformation)))
{
DbgPrint("FileValidDataLength=%x\n", status);
}
ULONG offset = 0;
ULONG dwNumberOfBytesTransfered = _BlockSize;
_BytesLeft = _TotalSize + dwNumberOfBytesTransfered;
ULONG ConcurrentRequestCount = _ConcurrentRequestCount;
REQUEST* irp = _pRequests;
_StartTime = GetTickCount64();
do
{
irp->opcode = opWrite;
irp->pTest = this;
irp->offset = offset;
offset += dwNumberOfBytesTransfered;
DoWrite(irp++);
} while (--ConcurrentRequestCount);
}
void FillBuffer(PULONGLONG pu, LONGLONG ByteOffset)
{
ULONG n = _BlockSize / sizeof(ULONGLONG);
do
{
*pu++ = ByteOffset, ByteOffset += sizeof(ULONGLONG);
} while (--n);
}
void DoWrite(REQUEST* irp)
{
LONG BlockSize = _BlockSize;
LONGLONG BytesLeft = InterlockedExchangeAddNoFence64(&_BytesLeft, -BlockSize) - BlockSize;
if (0 < BytesLeft)
{
LARGE_INTEGER ByteOffset;
ByteOffset.QuadPart = _TotalSize - BytesLeft;
PVOID Buffer = RtlOffsetToPointer(_pData, irp->offset);
FillBuffer((PULONGLONG)Buffer, ByteOffset.QuadPart);
AddRef();
NTSTATUS status = NtWriteFile(_hFile, 0, 0, irp, irp, Buffer, BlockSize, &ByteOffset, 0);
if (0 > status)
{
OnComplete(status, 0, irp);
}
}
else if (!BytesLeft)
{
// write end
ULONG64 time = GetTickCount64() - _StartTime;
WCHAR sz[64];
StrFormatByteSizeW((_TotalSize * 1000) / time, sz, RTL_NUMBER_OF(sz));
DbgPrint("end:%S\n", sz);
}
}
static VOID NTAPI _OnComplete(
_In_ NTSTATUS status,
_In_ ULONG_PTR dwNumberOfBytesTransfered,
_Inout_ PVOID Ctx
)
{
reinterpret_cast<REQUEST*>(Ctx)->pTest->OnComplete(status, dwNumberOfBytesTransfered, reinterpret_cast<REQUEST*>(Ctx));
}
VOID OnComplete(NTSTATUS status, ULONG_PTR dwNumberOfBytesTransfered, REQUEST* irp)
{
if (0 > status)
{
DbgPrint("OnComplete[%x]: %x\n", irp->opcode, status);
}
else
switch (irp->opcode)
{
default:
__debugbreak();
case opCompression:
StartWrite();
break;
case opWrite:
if (dwNumberOfBytesTransfered == _BlockSize)
{
DoWrite(irp);
}
else
{
DbgPrint(":%I64x != %x\n", dwNumberOfBytesTransfered, _BlockSize);
}
}
Release();
}
NTSTATUS Create(POBJECT_ATTRIBUTES poa, ULONGLONG size)
{
if (!(_pRequests = new REQUEST[_ConcurrentRequestCount]) ||
!(_pData = VirtualAlloc(0, _BlockSize * _ConcurrentRequestCount, MEM_COMMIT, PAGE_READWRITE)))
{
return STATUS_INSUFFICIENT_RESOURCES;
}
ULONGLONG sws = _BlockSize - 1;
LARGE_INTEGER as;
_TotalSize = as.QuadPart = (size + sws) & ~sws;
HANDLE hFile;
IO_STATUS_BLOCK iosb;
NTSTATUS status = NtCreateFile(&hFile,
DELETE|FILE_GENERIC_READ|FILE_GENERIC_WRITE&~FILE_APPEND_DATA,
poa, &iosb, &as, 0, 0, FILE_OVERWRITE_IF,
FILE_NON_DIRECTORY_FILE|FILE_NO_INTERMEDIATE_BUFFERING, 0, 0);
if (0 > status)
{
return status;
}
_hFile = hFile;
if (0 > (status = RtlSetIoCompletionCallback(hFile, _OnComplete, 0)))
{
return status;
}
static USHORT cmp = COMPRESSION_FORMAT_NONE;
REQUEST* irp = _pRequests;
irp->pTest = this;
irp->opcode = opCompression;
AddRef();
status = NtFsControlFile(hFile, 0, 0, irp, irp, FSCTL_SET_COMPRESSION, &cmp, sizeof(cmp), 0, 0);
if (0 > status)
{
OnComplete(status, 0, irp);
}
return status;
}
};
void WriteSpeed(POBJECT_ATTRIBUTES poa, ULONGLONG size, ULONG BlockSize, ULONG ConcurrentRequestCount)
{
BOOLEAN b;
NTSTATUS status = RtlAdjustPrivilege(SE_MANAGE_VOLUME_PRIVILEGE, TRUE, FALSE, &b);
if (0 <= status)
{
status = STATUS_INSUFFICIENT_RESOURCES;
if (WriteTest * pTest = new WriteTest(BlockSize, ConcurrentRequestCount))
{
status = pTest->Create(poa, size);
pTest->Release();
if (0 <= status)
{
MessageBoxW(0, 0, L"Test...", MB_OK|MB_ICONINFORMATION);
}
}
}
}