问题
I ported a multi-threaded Linux application to Windows and am testing it on a server running Windows 10 Pro. The performance of the Windows version is abysmal compared to the performance of the Linux version running on the same dual-boot hardware. I simplified the code to a small multi-threaded example that exhibits the same symptoms. I am hoping that the SO community can provide some insight as to why there are such performance differences between Windows and Linux for this application, and suggestions on how to remedy the problem.
The machine I'm testing on has dual Intel Xeon Gold 6136 CPUs (24/48 physical/logical cores) @3.0 GHz (Turbo-boost to 3.6 GHz) with 128 GB of memory. The machine is set up to dual-boot CentOS or Windows 10. There is no Windows Hypervisor running (Hyper-V is disabled). NUMA is disabled. In the testing I am performing, each thread should be able to run on a separate core; there are no other processor-consuming applications running.
The application performs complex transformations to convert input data sets of ~15 MB to output data of ~50 MB. I wrote simplified multi-threaded tests (computation only, data movement only, etc) to narrow down the issue. A computation-only test showed no performance differences, but a data-copy scenario did. The repeatable scenario is simply to have each thread copy data from its 15 MB input buffer to its 50 MB output buffer. Each 'int' in the input buffer is written consecutively to the output buffer 3 times. Results from virtually identical Linux and Windows code for 100 iterations with N threads are shown below:
Windows (or cygwin) Linux (native)
Threads Time (msec) Time (msec)
1 4200 3000
2 4020 2300
3 4815 2300
4 6700 2300
5 8900 2300
6 14000 2300
7 16500 2300
8 21000 2300
12 39000 2500
16 75000 3000
24 155000 4000
The times above are the processing time in the worker threads. The results do not include any time for allocating memory or starting the threads. It seems that threads are running independently under Linux but are not under Windows 10.
The full C code I used for Windows testing is here:
//
// Thread test program
//
// To compile for Windows:
// vcvars64.bat
// cl /Ox -o windowsThreadTest windowsThreadTest.c
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <windows.h>
#include <process.h>
#define __func__ __FUNCTION__
//
// Global data
//
// One handle per worker thread, as returned by CreateThread in main().
HANDLE *threadHandleArray = NULL;
// One thread id per worker thread, filled in by CreateThread in main().
DWORD *threadIdArray = NULL;
//
// Time keeping
//
// Both arrays are allocated in main() with one slot per processor and are
// written by StartCounter() and read by GetCounter().
//
// Performance-counter ticks per millisecond (frequency / 1000.0).
double *PCFreq = NULL;
// Performance-counter value captured by StartCounter(); a value of 0 is
// treated by GetCounter() as "not started".
__int64 *CounterStart = NULL;
//
// Record the timing baseline for one slot of the per-processor arrays.
//
// Stores the performance-counter frequency, converted to ticks per
// millisecond, in PCFreq[whichProcessor] and the current counter value in
// CounterStart[whichProcessor].
//
// whichProcessor: index into PCFreq / CounterStart. The caller must make
//                 sure it is below the number of slots that were allocated.
//
// Prints a message and leaves both slots untouched when either array has
// not been allocated, when the index is negative, or when the frequency
// query fails.
//
void StartCounter(int whichProcessor)
{
    LARGE_INTEGER li;

    if ( !PCFreq )
    {
        printf("No freq array\n");
        return;
    }
    // CounterStart is written below, so it needs the same guard as PCFreq.
    if ( !CounterStart )
    {
        printf("No counter array\n");
        return;
    }
    if ( whichProcessor < 0 )
    {
        printf("Invalid processor index %d\n", whichProcessor);
        return;
    }
    if ( !QueryPerformanceFrequency(&li) )
    {
        printf("QueryPerformanceFrequency failed!\n");
        return;
    }
    // Dividing by 1000 makes later (ticks / PCFreq) results come out in msec.
    PCFreq[whichProcessor] = ((double)(li.QuadPart))/1000.0;

    QueryPerformanceCounter(&li);
    CounterStart[whichProcessor] = li.QuadPart;
}
//
// Return the time in milliseconds since StartCounter() was called for the
// slot belonging to the processor this thread is currently running on.
//
// Returns 0.0 when the timing arrays have not been allocated or when the
// slot's start value is 0 (treated as "never started").
//
// Note: the slot is chosen with GetCurrentProcessorNumber() at call time.
// A caller that subtracts two results obtained on different processors gets
// a difference that also contains the gap between those two slots' start
// values.
//
double GetCounter(void)
{
    LARGE_INTEGER li;
    DWORD whichProcessor;

    // Both arrays are dereferenced below; bail out if either is missing.
    if ( !PCFreq || !CounterStart )
        return 0.0;

    whichProcessor = GetCurrentProcessorNumber();
    if ( CounterStart[whichProcessor] == 0 )
        return 0.0;

    QueryPerformanceCounter(&li);
    return ((double)(li.QuadPart-CounterStart[whichProcessor]))/PCFreq[whichProcessor];
}
//
// Per-worker-thread record. main() fills in the inputs before starting the
// thread; workerProc() fills in the results.
//
typedef struct
{
// Result of rwtest(); main() presets it to -1 and treats a negative value
// after the thread finishes as an error.
int retVal;
// Index of this worker (0 .. numThreads-1), assigned by main().
int instance;
// Thread id as returned by GetCurrentThreadId() inside the worker.
long myTid;
// Non-zero enables the worker's progress message.
int verbose;
// GetCounter() value (msec) taken just before rwtest() runs.
double startTime;
// GetCounter() value after rwtest() minus startTime, in msec.
double elapsedTime;
// Sum of elapsedTime over all loop iterations, accumulated by main().
double totalElapsedTime;
// Arguments handed to rwtest().
struct {
// Number of ints to read from inData.
unsigned intsToCopy;
// Input buffer holding intsToCopy ints.
int *inData;
// Output buffer; rwtest() writes 3 ints here per input int.
int *outData;
} rwInfo;
} info_t;
//
// Expand the input array into the output array, writing every input value
// three times in a row:
//
//   outData[3*i] = outData[3*i+1] = outData[3*i+2] = inData[i]
//
// intsToCopy: number of ints to read from inData.
// inData:     input buffer with at least intsToCopy ints.
// outData:    output buffer with room for at least 3 * intsToCopy ints.
//
// Always returns 0.
//
int rwtest( unsigned intsToCopy, int *inData, int *outData)
{
    // size_t indices: the output index reaches 3 * intsToCopy, which does not
    // fit in a 32-bit unsigned for large inputs and would silently wrap
    // (on builds where size_t is 64 bits wide this can no longer happen).
    size_t src;
    size_t dst = 0;

    for ( src = 0; src < intsToCopy; src++ )
    {
        outData[dst]   = inData[src];
        outData[dst+1] = inData[src];
        outData[dst+2] = inData[src];
        dst += 3;
    }
    return 0;
}
//
// Thread entry point: run one rwtest() pass and record how long it took.
//
// workerInfoPtr: pointer to the info_t describing this worker. The
//                parameter is a plain LPVOID so the function matches the
//                thread-start signature CreateThread expects.
//
// Writes myTid, startTime, retVal and elapsedTime (msec) into the info_t.
// Always returns 0; the outcome of the test itself is reported in retVal.
//
DWORD WINAPI workerProc(LPVOID workerInfoPtr)
{
    info_t *infoPtr = (info_t *)workerInfoPtr;
    double endTime;

    infoPtr->myTid = GetCurrentThreadId();

    // Raise our own priority through the pseudo-handle. Looking the handle up
    // in threadHandleArray is racy: this thread can start running before the
    // creator has stored CreateThread's return value in that array.
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);

    // record start time
    infoPtr->startTime = GetCounter();

    // Run the test
    infoPtr->retVal = rwtest( infoPtr->rwInfo.intsToCopy, infoPtr->rwInfo.inData, infoPtr->rwInfo.outData );

    // end time
    endTime = GetCounter();
    infoPtr->elapsedTime = endTime - infoPtr->startTime;

    if ( infoPtr->verbose )
        printf("(%04lx): done\n", (unsigned long)infoPtr->myTid);  // myTid is a long
    return 0;
}
//
// Main Test Program
//
int main(int argc, char **argv)
{
int i, j, verbose=0, loopLimit;
unsigned size;
unsigned int numThreads;
info_t *w_info = NULL;
int numVirtualCores;
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
if ( argc != 4 )
{
printf("windowsThreadTest <numLoops> <numThreads> <Input size in MB>\n");
return -1;
}
numVirtualCores = sysinfo.dwNumberOfProcessors;
printf("%s: There are %d processors\n", __func__, numVirtualCores);
// Setup Timing
PCFreq = (double *)malloc(numVirtualCores * sizeof(double));
CounterStart = (__int64 *)malloc(numVirtualCores * sizeof(__int64));
if (!PCFreq || !CounterStart)
goto free_and_exit;
for ( i = 0; i < numVirtualCores; i++)
StartCounter(i);
//
// Process input args
//
loopLimit = atoi( argv[1] );
numThreads = atoi( argv[2] );
size = atoi( argv[3] ) * 1024 * 1024;
//
// Setup data array for each thread
//
w_info = (info_t *)malloc( numThreads * sizeof(info_t) );
if ( !w_info )
{
printf("Couldn't allocate w_info of size %zd, numThreads=%d\n", sizeof(info_t), numThreads);
goto free_and_exit;
}
memset( w_info, 0, numThreads * sizeof(info_t) );
//
// Thread Handle Array
//
threadHandleArray = (HANDLE *)malloc( numThreads * sizeof(HANDLE) );
if ( !threadHandleArray )
{
printf("Couldn't allocate handleArray\n");
goto free_and_exit;
}
//
// Thread ID Array
//
threadIdArray = (DWORD *)malloc( numThreads * sizeof(DWORD) );
if ( !threadIdArray )
{
printf("Couldn't allocate IdArray\n");
goto free_and_exit;
}
//
// Run the test
//
printf("Read/write testing... threads %d loops %lu input size %u \n", numThreads, loopLimit, size);
for ( j = 0; j < loopLimit; j++ )
{
//
// Set up the data for the threads
//
for ( i = 0; i < numThreads; i++ )
{
int idx;
int *inData;
int *outData;
unsigned inSize;
unsigned outSize;
inSize = size; // in MB
outSize = size * 3; // in MB
//
// Allocate input buffer
//
inData = (int *) malloc( inSize );
if ( !inData )
{
printf("Error allocating inData of size %zd\n", inSize * sizeof(char));
goto free_and_exit;
}
else
{
if ( verbose )
printf("Allocated inData of size %zd\n", inSize * sizeof(char));
}
//
// Allocate output buffer 3x the size of the input buf
//
outData = (int *) malloc( outSize * 3 );
if ( !outData )
{
printf("Error allocating outData of size %zd\n", outSize * sizeof(char));
goto free_and_exit;
}
else
{
if ( verbose )
printf("Allocated outData of size %zd\n", outSize * sizeof(char));
}
//
// Put some data into input buffer
//
w_info[i].rwInfo.intsToCopy = inSize/sizeof(int);
for ( idx = 0; idx < w_info[i].rwInfo.intsToCopy; idx++)
inData[idx] = idx;
w_info[i].rwInfo.inData = inData;
w_info[i].rwInfo.outData = outData;
w_info[i].verbose = verbose;
w_info[i].instance = i;
w_info[i].retVal = -1;
}
//
// Start the threads
//
for ( i = 0; i < numThreads; i++ )
{
threadHandleArray[i] = CreateThread( NULL, 0, workerProc, &w_info[i], 0, &threadIdArray[i] );
if ( threadHandleArray[i] == NULL )
{
fprintf(stderr, "Error creating thread %d\n", i);
return 1;
}
}
//
// Wait until all threads have terminated.
//
WaitForMultipleObjects( numThreads, threadHandleArray, TRUE, INFINITE );
//
// Check the return values
//
for ( i = 0; i < numThreads; i++ )
{
if ( w_info[i].retVal < 0 )
{
printf("Error return from thread %d\n", i);
goto free_and_exit;
}
if ( verbose )
printf("Thread %d, tid %x %f msec\n", i, (unsigned)w_info[i].myTid, w_info[i].elapsedTime);
w_info[i].totalElapsedTime += w_info[i].elapsedTime;
}
//
// Free up the data from this iteration
//
for ( i = 0; i < numThreads; i++ )
{
free( w_info[i].rwInfo.inData );
free( w_info[i].rwInfo.outData );
CloseHandle( threadHandleArray[i] );
}
}
//
// All done, print out cumulative time spent in worker routine
//
for ( i = 0; i < numThreads; i++ )
{
printf("Thread %d, loops %d %f msec\n", i, j, w_info[i].totalElapsedTime);
}
free_and_exit:
if ( threadHandleArray )
free( threadHandleArray );
if ( threadIdArray )
free( threadIdArray );
if ( PCFreq )
free( PCFreq );
if ( CounterStart )
free( CounterStart );
if ( w_info )
free( w_info );
return 0;
}
The code above was easily changed to utilize pthreads, compiling with the command line 'gcc -O3 -o pthreadTestLinux pthreadTest.c' to obtain the Linux results described above (I can post if necessary). If compiled on Windows with gcc in a cygwin environment, the results mirror those using the Windows sample code.
I've experimented with various BIOS settings, raising the thread priority, pre-allocated thread pools, etc with no change in the performance. I don't think this is a case of false-sharing due to the fact that the Linux version displays radically different performance with virtually identical code. I'm wondering if there is something in how I'm compiling. I am using the 64-bit toolchain.
Any ideas?
回答1:
I've seen similar issues with Cygwin apps on multicore/multiprocessor machines. As far as I know, this is still an unsolved problem in Cygwin.
One thing I noticed, and you can try, is that pinning the process to a single CPU may dramatically improve its performance (but obviously will also limit the ability to take advantage of multicore and multithread parallelism). You can pin the process to a single CPU by using Windows task manager to set the process affinity to just one CPU/core.
If doing so improves the performance of a single thread significantly, then you're seeing the same problem I've noticed. And, I don't believe it's a problem with your code then, but a problem with Cygwin.
回答2:
Was curious to see how the Windows performance for this compared to the Linux performance for the multi-threaded memory transforming issue in golang, so I ported the code to as close to the original as possible and then did a few of the same performance tests on a similar hardware platform.
Unlike the results seen in the posted question, the golang code did not blow up as the number of simultaneous operations increased. The corresponding performance chart is:
Num Threads Time in Process
1 4000
2 4100
4 4200
6 3600
12 3600
16 3800
24 3700
These results are significantly slower than what you show in the C code running on Linux.
Not sure if any of this is helpful, but it looks like there is a general issue with Windows 10 causing multi-threaded performance problems when doing some memory operations, and there also seems to be a correlation with the performance of the C code when compiled by both cl and gcc (cygwin), as you describe in your question.
The golang code is:
package main
import "fmt"
import "os"
import "time"
import "strconv"
// rwtest expands the input slice into the output slice: each of the first
// intsToCopy input values is written three times in a row, so input element
// i lands in output elements 3*i, 3*i+1 and 3*i+2.
func rwtest(intsToCopy int, inData *[]int, outData *[]int) {
	for src, dst := 0, 0; src < intsToCopy; src, dst = src+1, dst+3 {
		(*outData)[dst] = (*inData)[src]
		(*outData)[dst+1] = (*inData)[src]
		(*outData)[dst+2] = (*inData)[src]
	}
}
// workerProc runs numLoops iterations of the copy test. Every iteration
// allocates a fresh input slice of dataSize ints and an output slice three
// times that size, and only the rwtest call itself is timed. When all
// iterations are done it prints the accumulated duration and sends 0 on
// reportChan to signal completion.
func workerProc(threadNum int, reportChan chan int, numLoops int, dataSize int) {
	var total time.Duration

	for loop := 0; loop < numLoops; loop++ {
		src := make([]int, dataSize)
		dst := make([]int, dataSize*3)

		began := time.Now()
		rwtest(dataSize, &src, &dst)
		total += time.Since(began)
	}

	// Print out the cumulative time
	fmt.Printf("Thread %d duration is %d\n", threadNum, total)

	// Tell the caller this worker has finished
	reportChan <- 0
}
// main parses <num threads> <num loops> <data size> from the command line,
// starts one workerProc goroutine per thread, and waits for each of them to
// report completion on its own channel.
//
// Arguments that are not non-negative integers are rejected with a message
// and the usage line rather than being silently read as 0.
func main() {
	if len(os.Args) != 4 {
		fmt.Printf("Usage: %s <num threads> <num loops> <data size>\n", os.Args[0])
		return
	}

	argNames := [...]string{"num threads", "num loops", "data size"}
	var argValues [len(argNames)]int
	for k, name := range argNames {
		value, err := strconv.Atoi(os.Args[k+1])
		// Negative sizes would make the later make() calls panic.
		if err != nil || value < 0 {
			fmt.Printf("Invalid <%s> argument %q: expected a non-negative integer\n", name, os.Args[k+1])
			fmt.Printf("Usage: %s <num threads> <num loops> <data size>\n", os.Args[0])
			return
		}
		argValues[k] = value
	}
	numThreads, numLoops, dataSize := argValues[0], argValues[1], argValues[2]

	fmt.Printf("Running Program with %d threads, with %d loops\n", numThreads, numLoops)

	// Make a channel for each thread
	chans := make([]chan int, 0, numThreads)
	for i := 0; i < numThreads; i++ {
		chans = append(chans, make(chan int))
	}

	// start the threads
	for i := 0; i < numThreads; i++ {
		go workerProc(i, chans[i], numLoops, dataSize)
	}

	// Loop through the channels, waiting for each go routine to finish
	var x int
	for i := 0; i < numThreads; i++ {
		x = <-chans[i]
	}
	fmt.Printf("Done: %d\n", x)
}
回答3:
YouTubers Level 1 Techs were seeing this on Threadripper processors as well. Long story short, the Windows 10 kernel seems to be shuffling threads between cores FAR, FAR too much while the program is running. https://www.youtube.com/watch?v=M2LOMTpCtLA
I have no idea if this is a problem with Server 2016 or 2019 kernel also. Being a new owner of a Threadripper 2950x myself, I would really like to get this solved.
来源:https://stackoverflow.com/questions/51217320/multi-threading-performance-much-worse-on-windows-10-than-linux