问题
I'll first do some contextualization. I'm trying to implement a non-blocking work stealing method using deques in CUDA. The deques (aDeques) are in a block-segmented array in global memory and the popWork() device function has the objective of popping work to feed threads. In addition to the global deques, each block has a stack in shared memory (aLocalStack) where it can locally work. The pop occurs in 3 levels. First attempt is in the shared stack, second attempt is in the deque owned by the block and third attempt is to steal work from other deques. Each deque has global bottom and pop pointers that lie in global memory arrays (aiDequesBottoms and auiDequesAges). My problem is that when a block changes a global deque pointer, the changes aren't visible to other blocks when I test the code on a GTS450. It seems like the cache is not being updated. I have also tested on a GT520 card, where the problem does not occur. I have experienced similar problems with the aiDequeFlags array. Those problems were solved by declaring it volatile. Unfortunately, I can't do the same for the deque pointer arrays, since I need to use atomic functions on them later. I'm sorry not to put the problem in a simpler example, but I couldn't reproduce this behavior. This first snippet has the popWork() interface explained.
//Declaration of the popWork() device-side interface (definition follows below).
//Pops work in three levels: shared-memory stack, the block's own global deque,
//then work stealing from other blocks' deques.
template <int iDequeSize> //Size of each segment in aDeques
bool __inline__ __device__ popWork(
volatile int *aiDequeFlags , //Flags that indicate if a deque is active (has work)
int *aiDequesBottoms , //Deque bottom pointers
unsigned int *auiDequesAges , //Deque top pointers (29 higher bits) +
//Tag bits (3 lower bits).
const Int2Array *aDeques , //Deques (Int2Array is an interface for 2 int arrays)
int &uiStackBot , //Shared memory stack pointer
int2 *aLocalStack , //Shared memory local stack
const int &iTid , //threadIdx.x
const int &iBid , //blockIdx.x
//All other parameters are output
unsigned int &uiPopDequeIdx , //Chosen deque for pop
int2 *popStartIdxAndSize , //Arrays of pop start index and sizes
bool *bPopFlag , //Array of flags for pop in each level
unsigned int &uiActiveDequesIdx , //Flag to indicate pop failed (no more work)
int2 &work); //Actual acquired thread work
This second snippet has the entire function. The kernel that uses the function was launched with 8 blocks of 64 threads, and in the beginning just deque 0 has 1 work item, while all other deques are empty. There are some debug printf calls to generate a log, which will be shown in the next snippet.
//Three-level work acquisition for one block:
// Level 1: pop from the block-local shared-memory stack.
// Level 2: pop from the block's own deque in global memory.
// Level 3: "lazy" steal (reuse a steal result left in popStartIdxAndSize by a
// previous attempt), then an active steal loop over all other deques.
//Returns true when work was acquired; each of the first popStartIdxAndSize[iBid].y
//threads receives one int2 item in 'work'.
//NOTE(review): bPopFlag[], uiPopDequeIdx, uiActiveDequesIdx and popStartIdxAndSize[]
//are written by single threads and read by all threads after __syncthreads(), so
//they are presumably in shared memory - confirm at the call site. The arrays
//aiDequesBottoms/auiDequesAges/aiDequeFlags are shared BETWEEN blocks, so their
//plain (non-atomic, non-volatile) reads here can be served from a stale per-SM L1
//line - this is the coherence problem described in the question.
template <int iDequeSize>
bool __inline__ __device__ popWork(volatile int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Int2Array *aDeques , int &uiStackBot , int2 *aLocalStack , const int &iTid , const int &iBid ,
unsigned int &uiPopDequeIdx , int2 *popStartIdxAndSize , bool *bPopFlag , unsigned int &uiActiveDequesIdx , int2 &work)
{
//Level 1: pop from local stack; only thread 0 touches the stack pointer.
if(iTid == 0)
{
//Local age is a throwaway: the shared stack has no concurrent top, but
//popBottom expects an age reference.
unsigned int uiAge = 0;
bPopFlag[0] = popBottom(uiStackBot , uiAge , popStartIdxAndSize[iBid]);
//bPopFlag[3] is the overall "work acquired" flag returned at the end.
bPopFlag[3] = bPopFlag[0];
}
__syncthreads(); //publish bPopFlag/popStartIdxAndSize to the whole block
if(bPopFlag[0])
{
//Local pop succeeded: the first .y threads each grab one item from the stack.
if(iTid < popStartIdxAndSize[iBid].y)
{
work = aLocalStack[popStartIdxAndSize[iBid].x + iTid];
}
}
else
{
if(iTid == 0)
{ //Level 2: try to pop from the block's own global deque
bPopFlag[1] = popBottom(aiDequesBottoms[iBid] , auiDequesAges[iBid] , popStartIdxAndSize[iBid]);
if(bPopFlag[1])
{
uiPopDequeIdx = iBid;
//Debug
if(iBid == 0)
{
printf("Block %d pop global deque. Bottom=%d\n" , iBid , aiDequesBottoms[iBid]);
}
//
}
else
{
//Own deque is empty: deactivate it so other blocks stop trying to steal
//from it, and reset this block's pop slot/choice to the INFTY sentinel.
aiDequeFlags[iBid] = 0;
popStartIdxAndSize[iBid].x = INFTY;
uiPopDequeIdx = INFTY;
}
bPopFlag[3] = bPopFlag[1];
bPopFlag[2] = false;
}
__syncthreads();
if(!bPopFlag[1])
{
//Level 3a: "lazy" steal - if some slot popStartIdxAndSize[iTid] (iTid != iBid)
//already holds a valid range, presumably left over from an earlier popTop on
//that victim, claim the lowest-indexed such deque via atomicMin.
if(iTid < NDEQUES)
{
if(popStartIdxAndSize[iTid].x != INFTY && iTid != iBid)
{
atomicMin(&uiPopDequeIdx , iTid);
bPopFlag[2] = true;
bPopFlag[3] = true;
}
}
__syncthreads();
//The thread whose index equals the winning deque moves the stolen range into
//this block's own slot and invalidates the source slot.
if(iTid == uiPopDequeIdx)
{
popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
popStartIdxAndSize[iTid].x = INFTY;
}
//Level 3b: active steal loop - one thread per candidate deque attempts popTop.
//The loop condition reads shared bPopFlag[3], so all threads take the same
//number of iterations and the barriers inside stay uniform.
while(!bPopFlag[3])
{ //No more work, try to steal some!
__syncthreads();
if(iTid == 0)
{
uiActiveDequesIdx = 0; //reset "any deque still active" indicator
}
__syncthreads();
if(iTid < NDEQUES)
{
//NOTE(review): this read of aiDequeFlags (volatile) and of
//aiDequesBottoms (NOT volatile) is where stale L1 values were observed.
if(aiDequeFlags[iTid] == 1)
{
uiActiveDequesIdx = 1;
//Debug
printf("Block %d steal attempt on block %d. Victim bottom=%d\n" , blockIdx.x , threadIdx.x , aiDequesBottoms[iTid]);
//
if(popTop(aiDequesBottoms , auiDequesAges , iTid , popStartIdxAndSize[iTid]))
{
//Steal succeeded: mark this block active again and elect the
//lowest-indexed successful victim as the pop source.
aiDequeFlags[iBid] = 1;
atomicMin(&uiPopDequeIdx , iTid);
bPopFlag[3] = true;
//Debug
//printf("%d ss %d %d %d\n" , iBid , iTid , popStartIdxAndSize[iTid].x , popStartIdxAndSize[iTid].y);
//
}
}
}
__syncthreads();
if(uiActiveDequesIdx == 0)
{ //No more work to steal. End.
break;
}
//Move the winning steal's range into this block's slot (same pattern as 3a).
if(iTid == uiPopDequeIdx)
{
popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
popStartIdxAndSize[iTid].x = INFTY;
}
__syncthreads();
}
}
__syncthreads();
//Levels 2/3 deliver work straight from the chosen global deque segment.
if(bPopFlag[3] && iTid < popStartIdxAndSize[iBid].y) //assuming number of threads >= WORK_SIZE
{
aDeques->getElement(work , uiPopDequeIdx*iDequeSize + popStartIdxAndSize[iBid].x + iTid);
}
}
return bPopFlag[3];
}
This last snippet is the generated log. The push lines ("Block X push. Bottom=Y") were generated by a push function which is not shown here. Remember that in the beginning, just block 0 has 1 work item.
Block 0 pop global deque. Bottom=0
Block 4 steal attempt on block 0. Victim bottom=0
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 4 steal attempt on block 0. Victim bottom=0
Block 7 steal attempt on block 0. Victim bottom=1
Block 0 push. Bottom=448
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 4. Victim bottom=0
Block 1 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 4. Victim bottom=0
Block 5 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 4. Victim bottom=0
Block 4 push. Bottom=384
As can be seen, only block 4 can see the changes in block 0's deque bottom pointer. I tried adding some __threadfence() calls after any change in the pointers, but no success. Thanks for the attention!
回答1:
It seems based on the comments that the only working solution is to turn off L1 caching. This can be accomplished on a program-wide basis by passing the following switch to nvcc when compiling:
–Xptxas –dlcm=cg
The L1 caches are a property/resource of the SM, not the device as a whole. Since threadblocks execute on specific SMs, the activity of one threadblock in its L1 cache can be incoherent from the activity of another threadblock and its L1 cache (assuming it happens to be running on a different SM), even though they are both referencing the same locations in global memory. L1 caches in different SMs have no connection with each other and are not guaranteed to be coherent with each other.
Note that the L2 cache is device-wide and therefore "coherent" from the perspective of individual threadblocks. Turning off L1 caching has no effect on L2 caching, so there is still the possibility of some caching benefit, however the time required to satisfy a request out of L2 is longer than the time required to satisfy a request out of L1, so turning off L1 caching program-wide is a pretty large hammer to try to get things working.
The volatile
keyword in front of a variable definition should have the effect of telling the compiler to skip L1 caching on loads (according to my understanding). But volatile by itself doesn't address the write path, so it's possible for one threadblock in one SM to do a volatile
read, pulling a value out of L2, modify that value, and then write it back, where it ends up in L1 (until it is evicted). If another threadblock reads the same global value, it may not see the effect of the update.
Diligent use of __threadfence() while tedious, should force any such updates out of L1 into L2, so that other threadblocks can read them. However this still leaves a synchronization gap from when the value was written to when it is observable by other SMs/threadblocks.
(Global) Atomics should also have the effect of going directly to "global memory" to read and write the values used.
It may be instructive to also go through the code to ensure that every possible read from a globally synchronized location is handled properly (e.g. with volatile
or using atomics) and that every possible write to a globally synchronized location is handled properly (e.g. with __threadfence()
or atomics), and also check for race conditions between different blocks.
As discovered, the process of creating a stable globally-synchronized environment within the GPU is non-trivial. These other questions may also be of interest (e.g. with respect to Kepler) (and e.g. discussing global semaphores).
Edit: To respond to a question posted in the comments, I would say this:
Perhaps there's no issue. However __threadfence()
provides no guarantee (that I know of) for a maximum completion time. Therefore at the moment an update is made to a global location, only the L1 associated with the executing threadblock/SM gets updated. Then we hit the __threadfence()
. Presumably threadfence takes some time to complete, and during this time another threadblock could be resident on the same SM, brought in for execution (while the previous thread/warp/block is stalled at the threadfence), and "see" the updated global value in the (local) L1 associated with that SM. Other threadblocks executing in other SMs will see the "stale" value until the __threadfence()
completes. This is what I am referring to as a possible "synchronization gap". Two different blocks can still see two different values, for a brief period of time. Whether this matters or not will be dependent on how the global value is being used for synchronization between blocks (since that is the topic under discussion.) Therefore atomics + volatile may be a better choice than volatile + threadfence, to try and cover both read and write paths for synchronization.
Edit #2: It seems from the comments that the combination of the use of atomics plus volatile
also solved the problem.
回答2:
Frankly, I find your code overly complicated with indices and - more importantly - incomplete. How do popBottom
and popTop
function? Moreover, how is the push
operation implemented? Those two have to be carefully crafted in order to work correctly and ensure that some synchronization problems do not occur.
For example: what will happen when one block tries to push something to its global-memory queue, while another block tries to read from it at the very same moment? This is very important and if it is not done right, it can crash in some very rare circumstances, e.g. you may pop from a data cell which was not written to yet.
When I was implementing a similar thing - a single global-memory deque shared between all blocks - I was additionally marking each data cell as: empty, occupied and dead. In pseudocode the algorithm worked more-or-less like this:
/* Objects of this class should reside in CUDA global memory */
/*
Pseudocode for a single global-memory work queue shared by all blocks.
Each cell carries a status word (Free/Taken/Dead) so that a popping block can
never hand out a cell whose data has not been fully written by the pusher.
NOTE(review): illustrative pseudocode, not compilable as-is - CUDA's
atomicInc/atomicCAS operate on unsigned int (or unsigned long long), not
size_t, so head/tail/status would need type adjustments on a real device.
Overflow is deliberately not checked (see the text below the snippet).
*/
template <typename T, size_t size>
class WorkQueue {
private:
size_t head, tail; //consumer/producer cursors, advanced with atomicInc modulo size
size_t status[size]; //per-cell state, one of the Field* values below
T data[size];
enum {
FieldFree = 0, //cell empty, available for a pusher
FieldDead = 1, //cell skipped by a failed pop; its data will never be consumed
FieldTaken = 2 //cell holds valid, fully-published data
};
public:
/*
This construction should actually be done by host on the device,
before the actual kernel using it is launched!
Zeroing the memory should suffice.
*/
WorkQueue() : head(0), tail(0) {
for (size_t i=0; i<size; ++i)
status[i]=FieldFree;
}
//NOTE(review): head and tail are read non-atomically here, so this is only a hint.
__device__ bool isEmpty() { return head==tail; }
/* single thread of a block should call this */
__device__ bool push(const T& val) {
size_t oldFieldStatus;
do {
//Claim the next cell slot; the data write below is abandoned (and retried
//on a fresh cell) if the CAS finds the cell was not Free.
size_t cell = atomicInc(&tail,size-1);
data[cell]=val;
__threadfence(); //order the data write before publishing the cell, device-wide
oldFieldStatus=atomicCAS(&status[cell],FieldFree,FieldTaken); //mark the cell as occupied
} while (oldFieldStatus!=FieldFree);
return true;
}
/* single thread of a block should call this */
__device__ bool pop(T& out) {
size_t cellStatus;
size_t cell;
do {
cell=atomicInc(&head,size-1);
cellStatus=atomicCAS(&status[cell],FieldFree,FieldDead);
//If cell was free, make it dead - any data stored there will not be processed. Ever.
} while (cellStatus==FieldDead); //skip over cells already killed by earlier failed pops
if (cellStatus!=FieldTaken)
return false; //cell was Free: queue had no published work; the cell is now Dead
out = data[cell]; //safe: Taken status was published after the data (threadfence in push)
status[cell]=FieldFree; //recycle the cell for future pushes
return true;
}
};
I do not see a reliable way of implementing it without the cell status - otherwise bad things will happen if two threads from two different blocks try to push/pop into the same cell of the deque. With the approach above, the worst case is that the popping thread will fail to pop, return false and mark the cell as dead
, and the pushing thread will retry pushing into next cell.
The idea behind this is that if the popping thread fails to pop, then there is probably not much work to do anyway and the block can terminate. With that approach you will "kill" only as many cells as there are blocks running in parallel.
Note, in the above code I do not check for overflow!
来源:https://stackoverflow.com/questions/14484739/how-can-i-enforce-cuda-global-memory-coherence-without-declaring-pointer-as-vola