UNIX domain datagram socket — server does not receive all records

送分小仙女□ 提交于 2020-06-17 09:40:54

问题


I have a multicore program that uses a UNIX domain datagram socket. Three cores (or more, but 3 in this example) send records with sendto, and one core receives records with recvfrom and processes the records sent by the multiple clients. Each thread is pinned to a separate core (using its affinity mask).

The program processes a data series in a loop and at the end of each iteration the cores send data to the socket. The data series is small 64-bit integers.

My problem is that I receive records only from core 0 on the receive side (the receive side starts at Write_Data_To_Memory_fn near the bottom of the NASM listing).

The code is a shared object written in NASM and called by a C wrapper. The socket-handling functions are all C functions that were linked into the NASM shared object. The C socket functions are called from NASM:

#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "tlpi_hdr_sockets.h"
#include "ud_ucase_sockets.h"
#include "error_functions_sockets.h"
#include "get_num_sockets.h"
#include "errno.h"

#define BUF_SIZE 750 /* Max size of messages between client to  server */
#define SV_SOCK_PATH "/tmp/ud_ucase"

/*
 * Create a UNIX domain datagram socket and bind it to SV_SOCK_PATH.
 *
 * svaddr   Ignored.  It is received by value, so anything stored in it is
 *          lost when the function returns; the parameter is kept only so
 *          existing callers continue to link.
 * retvals  Caller-supplied array of at least two elements.
 *
 * On success:
 *   retvals[0] = socket file descriptor
 *   retvals[1] = address (as an integer) of a heap-allocated
 *                struct sockaddr_un holding the bound address.  The block
 *                stays valid until the caller releases it with free().
 * On failure both elements are 0 and no descriptor is left open.
 *
 * Returns retvals.
 */
int64_t * create_socket_server(struct sockaddr_un svaddr, int64_t retvals[])
{
    struct sockaddr_un *bound = NULL;
    int sfd;

    (void) svaddr;

    retvals[0] = 0;
    retvals[1] = 0;

    sfd = socket(AF_UNIX, SOCK_DGRAM, 0); /* Create server socket */
    if (sfd == -1)
        return retvals;

    /* Heap storage: the previous code returned the address of the by-value
       parameter, which dangles as soon as this function returns. */
    bound = calloc(1, sizeof *bound);
    if (bound == NULL)
        goto fail;

    /* Remove a stale socket file from an earlier run; "not there" is fine */
    if (remove(SV_SOCK_PATH) == -1 && errno != ENOENT)
        goto fail;

    /* Construct well-known address and bind server socket to it */
    bound->sun_family = AF_UNIX;
    snprintf(bound->sun_path, sizeof bound->sun_path, "%s", SV_SOCK_PATH);

    if (bind(sfd, (struct sockaddr *) bound, sizeof *bound) == -1)
        goto fail;

    retvals[0] = sfd;
    retvals[1] = (int64_t)(intptr_t) bound;
    return retvals;

fail:
    /* Do not leak the descriptor or the address block on any error path */
    free(bound);
    close(sfd);
    return retvals;
}

/*
 * Create a UNIX domain datagram socket for a sending client and prepare a
 * PID-based address for it.
 *
 * The socket is NOT bound to that address here; the address is only built
 * and handed back.  Because the pathname is derived from getpid(), every
 * thread of one process gets the same pathname.
 *
 * claddr   Ignored.  It is received by value, so anything stored in it is
 *          lost when the function returns; the parameter is kept only so
 *          existing callers continue to link.
 * retvals  Caller-supplied array of at least two elements.
 *
 * On success:
 *   retvals[0] = socket file descriptor
 *   retvals[1] = address (as an integer) of a heap-allocated
 *                struct sockaddr_un.  Each call gets its own block, which
 *                stays valid until the caller releases it with free().
 * On failure both elements are 0 and no descriptor is left open.
 *
 * Returns retvals.
 */
int64_t * create_socket_client(struct sockaddr_un claddr, int64_t retvals[])
{
    struct sockaddr_un *addr;
    int sfd;

    (void) claddr;

    retvals[0] = 0;
    retvals[1] = 0;

    sfd = socket(AF_UNIX, SOCK_DGRAM, 0);
    if (sfd == -1)
        return retvals;

    /* Heap storage: the previous code returned the address of the by-value
       parameter, which dangles as soon as this function returns. */
    addr = calloc(1, sizeof *addr);
    if (addr == NULL) {
        close(sfd);
        return retvals;
    }

    addr->sun_family = AF_UNIX;
    snprintf(addr->sun_path, sizeof addr->sun_path, "/tmp/ud_ucase_cl.%ld", (long) getpid());

    retvals[0] = sfd;
    retvals[1] = (int64_t)(intptr_t) addr;

    return retvals;
}

/*
 * Send one datagram of msgLen bytes from buf to the address svaddr.
 *
 * The address length handed to sendto() is always
 * sizeof(struct sockaddr_un).
 *
 * Returns the number of bytes sent when that equals msgLen, otherwise 0
 * (this covers both a failed sendto() and a short send).
 */
ssize_t client_send(int sfd, const void * buf, size_t msgLen, const struct sockaddr * svaddr)
{
    const int64_t addr_size = sizeof(struct sockaddr_un);
    const ssize_t sent = sendto(sfd, buf, msgLen, 0, svaddr, addr_size);

    /* Signed result compared against the unsigned length: a -1 from
       sendto() converts to a huge unsigned value and so does not match. */
    return (sent == msgLen) ? sent : 0;
}

/*
 * Receive one datagram from socket sfd into buf.
 *
 * sfd     Socket descriptor (narrowed to int for recvfrom()).
 * buf     Destination buffer.
 * msgLen  Capacity of buf in bytes; at most this many bytes are stored.
 * claddr  Optional buffer of at least sizeof(struct sockaddr_un) bytes that
 *         receives the sender's address, or NULL when the caller does not
 *         need it.  The signature declares it const for compatibility with
 *         existing callers, but recvfrom() writes through it, so it must
 *         point to writable storage.
 *
 * Returns the number of bytes received, or 0 on error or when msgLen is
 * negative.
 */
int64_t server_receive(int64_t sfd, void * buf, int64_t msgLen, const void * claddr)
{
    /* recvfrom() needs a pointer to a real socklen_t holding the capacity of
       the address buffer.  The previous code cast the byte count itself to a
       pointer, so the kernel was given a bogus address. */
    socklen_t addr_len = sizeof(struct sockaddr_un);
    ssize_t received;

    if (msgLen < 0)
        return 0;

    /* Bound the read by the caller's buffer size rather than a fixed
       constant that may exceed it. */
    received = recvfrom((int) sfd, buf, (size_t) msgLen, 0,
                        (struct sockaddr *) claddr,
                        claddr != NULL ? &addr_len : NULL);

    if (received == -1)
        return 0;

    return received;
}

/*
 * Close a socket descriptor.
 *
 * The parameter now has an explicit int type; the old K&R-style
 * "close_socket(socket_fd)" relied on implicit int, which C99 removed.
 *
 * Returns the result of close(): 0 on success, -1 on error.
 */
int close_socket(int socket_fd)
{
    return close(socket_fd);
}

Here is a minimal version of the NASM program. This minimal version has only one line for data calculation (VCVTTPD2QQ zmm0,[r11+r15]), which loads 8 doubles and converts them to 8 qwords in zmm0. At label_805, zmm0 is copied to a memory buffer that is then sent to the writer core. The memory buffer is preallocated with space for all three calculation cores, and we write to it using an offset for the core number.

At label Check_Records near the bottom of the code listing we receive records and write them to a file. I removed most of that part except for code that writes each record to a file on receipt from the socket. In my tests, the server receives only records from core 0, not from cores 1 and 2, and that's the heart of my problem.

; Header Section
; Assembler mode, exported entry points, imported libc / C helper symbols,
; and all statically allocated state used by the routines in this file.
[BITS 64]

[default rel]

; NOTE(review): Process_Data and FreeMem_fn are exported here but no label
; with either name appears in the portion of the listing shown - confirm
; they are defined elsewhere.
global Main_Entry_fn
global While_Loop_Test_fn
global Write_Data_To_Memory_fn
global Process_Data
global FreeMem_fn
extern sched_getcpu, syscall
extern malloc, calloc, realloc, free
extern memcpy, fopen, fread, fwrite, fclose
extern thread_create_in_C_FC
extern get_core_count_C
extern create_socket_server, create_socket_client
extern client_send, server_receive, close_socket

section .data align=16
data_master_ptr: dq 0 ; second argument (rsi) of Main_Entry_fn
Return_Pointer_Array: dq 0, 0, 0, 0, 0, 0 ; filled in and returned by label_900
initial_dynamic_length: dq 0 ; byte count passed to malloc in Main_Entry_fn
XMM_Stack: dq 0, 0, 0, 0, 0, 0, 0
X_ptr: dq 0 ; first argument (rdi) of Main_Entry_fn: input data
X_length: dq 0 ; truncated from the first double at data_master_ptr
collect_ptr: dq 0 ; malloc'd buffer from Main_Entry_fn
collect_length: dq 0
collect_ctr: dq 0
Number_Of_Cores: dq 0 ; core count multiplied by 8 (e.g. 32)
Number_Of_Cores_Seq: dq 0 ;sequential core count (e.g. 4 cores)
Number_Of_Cores_Open: dq 0
Number_Of_Cores_Data: dq 0 ; 1 core is for writing data, not processing
bytes_written: dq 0
stride: dq 64 ; recomputed in Init_Cores_fn as 64 * (core count - 1)
out_fname_L: db     "/opt/Test_Output_Files/While_Loops_02_NASM_4MB_Test",0x00
file_mode_create: db "wb+",0x00
file_mode_open: db "a",0x00
file_mode_open_read: db "r",0x00
ALIGN 8
out_fname_ptr: dq 0 ; FILE* returned by the second fopen in Init_Cores_fn
ALIGN 16

; Client-side address block handed to create_socket_client.
; NOTE(review): sun_family is laid out as 7 bytes and initialised with the
; text "AF_UNIX", whereas the C helpers shown with this listing assign the
; numeric AF_UNIX constant to sun_family - confirm this layout against the
; platform's struct sockaddr_un.
struc sockaddr_cl
   .sun_family resb 7 ;AF_UNIX
   .sun_path resb 108 ;Pathname
endstruc

sockaddr_cl_attributes:
  struc_cli istruc sockaddr_cl
    at sockaddr_cl.sun_family, db "AF_UNIX"
    at sockaddr_cl.sun_path, db "/tmp/ud_ucase"
  iend

; Server-side address block handed to create_socket_server; same layout and
; the same review note as sockaddr_cl above.
struc sockaddr_sv
   .sun_family resb 7 ;AF_UNIX
   .sun_path resb 108 ;Pathname
endstruc

sockaddr_sv_attributes:
  struc_svr istruc sockaddr_sv
    at sockaddr_sv.sun_family, db "AF_UNIX"
    at sockaddr_sv.sun_path, db "/tmp/ud_ucase"
  iend

ALIGN 16
socket_receive_code: dq 0
wait_buffer_ptr: dq 0
wait_buffer_ctr: dq 0
wait_buffer_size: dq 0
wait_buffer_items: dq 0
wait_next_avail: dq 0
wait_buffer_bitmask: dq 0

memory_pointers_ptr: dq 0 ; table read in While_Loop_Test_fn, 3 pointers per core

where_from: dq 0

; Socket state.  These are single process-wide slots: While_Loop_Test_fn
; stores into socket_client_fd / socket_client_addr and
; Write_Data_To_Memory_fn stores into socket_server_fd / socket_server_addr.
socket_client_fd: dq 0
socket_server_fd: dq 0
socket_buffer: times 90 dq 0 ; 90 qwords = 720 bytes, the receive buffer
socket_client_addr: dq 0
socket_server_addr: dq 0
retvals_cli: times 2 dq 0
retvals_svr: times 2 dq 0

section .bss
close_record: resq 90 ; see label_899

; __________

section .text

; Init_Cores_fn
; Creates/truncates the output file and reopens it for append, sets the core
; counts, recomputes the stride, starts the threads through
; thread_create_in_C_FC, and then jumps to label_900.  There is no ret in
; this routine; control leaves through the jmp at the end.
; None of the fopen / fclose results are checked.
Init_Cores_fn:

; __________

; open output file
; (mode "wb+", then closed straight away; the handle is not kept)
lea rdi,[out_fname_L]
lea rsi,[file_mode_create]
call [rel fopen wrt ..got]
mov rdi,rax
call [rel fclose wrt ..got]

; reopen output file in append mode
; (the FILE* is kept in out_fname_ptr for the writer loop's fwrite)
lea rdi,[out_fname_L]
lea rsi,[file_mode_open]
call [rel fopen wrt ..got]
mov [out_fname_ptr],rax

; __________

get_cores:

mov rdi,1
call [rel get_core_count_C wrt ..got]

; The value returned by get_core_count_C is discarded by the next instruction
mov rax,4 ; TEMPORARILY HARDWIRED TO 4 CORES

mov [Number_Of_Cores_Seq],rax
mov [Number_Of_Cores_Open],rax
mov rbx,8
mul rbx ; rax = core count * 8 (mul also overwrites rdx)
mov [Number_Of_Cores],rax

; Set up shared memory buffers for communication with writer core (core 2)
; NOTE(review): other comments in this listing call core 0 the writer core -
; confirm which is intended.
%include    "/opt/P01_SH/_Include_Utilities/Malloc_for_VarLenOut_Minimal.asm"

; Calculate stride based on # of cores
; stride = 64 bytes * (core count - 1); with the hard-wired 4 cores this is 192
mov rax,64
mov rbx,[Number_Of_Cores_Seq]
sub rbx,1 ; Don't count the writer core
mul rbx
mov [stride],rax

; _____
; Create Threads

label_first:

mov rdi,[Number_Of_Cores_Seq] ; see above
call [rel thread_create_in_C_FC wrt ..got]

; _________

jmp label_900 ; All threads return here, and exit

; ______________________________________

; While_Loop_Test_fn
; Sender routine.  It opens a client socket, then walks the input at X_ptr in
; 64-byte steps starting at an offset derived from the CPU number; for each
; step it converts 8 doubles to qwords, stores them in this core's buffer and
; sends a 720-byte record with client_send.  When the loop ends it sends a
; 720-byte record carrying 999 at offset 704 and returns.
;
; Stack slots used (relative to rbp = rsp on entry):
;   [rbp-40]  (cpu number - 1) * 8
;   [rbp-24]  pointer to this core's record buffer
;
; NOTE(review): rbp, rbx and r15 are overwritten without being saved; they
; are callee-saved registers under the System V AMD64 ABI - confirm the
; caller does not rely on them.
While_Loop_Test_fn:

mov rbp,rsp
sub rsp,96

; Get the core number
; NOTE(review): if sched_getcpu returns 0 the value below becomes negative -
; presumably this routine never runs on CPU 0; verify against the thread
; setup code.
call [rel sched_getcpu wrt ..got]
sub rax,1 ; Core 0 is the write core so all other cores are decremented
mov rbx,8 ; multiply by 8
mul rbx
mov [rbp-40],rax

; Open the socket client
; The two values written by create_socket_client are copied into the shared
; socket_client_fd / socket_client_addr slots, so every thread that runs this
; routine overwrites the same two globals.
; NOTE(review): the C prototype shown with this listing takes the sockaddr_un
; by value, while this call passes two pointers in rdi/rsi - confirm the
; argument mapping against the System V AMD64 calling convention.
Open_Socket_Client:
lea rdi,[struc_cli]
lea rsi,[retvals_cli]
sub rsp,480
call [rel create_socket_client wrt ..got]
add rsp,480
mov rdi,rax
mov rax,[rdi]
mov [socket_client_fd],rax
mov rax,[rdi+8]
mov [socket_client_addr],rax

; _____
; Transfer the other pointers to the stack
; See Malloc_for_VarLenOut.asm include file

memory_ptrs:
mov rax,[rbp-40] ; Core#
mov rbx,3 ; 3 pointers per core
mul rbx
mov r15,rax ; Core offset in memory_pointers_ptr

mov rdi,[memory_pointers_ptr]

;v8_v_check:
mov rax,[rdi+r15+0] ; first of this core's three pointers
mov [rbp-24],rax

; __________

mov rax,[rbp-40] ; core #
mov rbx,8 ; core number x 8 = 0,64,128,192,etc
mul rbx
mov r15,rax ; while_counter (set per core)

; _________
; Populate registers

vzeroall
mov r11,[X_ptr]
mov r10,[X_length]

;______
; while while_counter < X_length
; (r15 is a byte offset into the input; a second, hard-coded bound of 1920
; also ends the loop)

label_4010:

;____________

label_401:
cmp r15,r10
jge label_899

cmp r15,1920
jg label_899

;______

first_data_read:

; Convert the 8 doubles at [X_ptr + r15] to 8 signed qwords (truncating)
VCVTTPD2QQ zmm0,[r11+r15]

;______

label_805:

; Record layout written here: bytes 0-63 = the 8 converted values,
; offset 704 = the value in [rbp-40], offset 712 = the current byte offset.
mov rsi,[rbp-24]
vmovdqu64 [rsi],zmm0
mov rax,[rbp-40]
mov [rsi+704],rax
mov [rsi+712],r15

; rcx, r10 and r11 are preserved around the call by hand
; NOTE(review): socket_server_addr is stored by Write_Data_To_Memory_fn; no
; instruction here waits for that store, so this load may see 0 or a stale
; value - confirm how the threads are ordered.
push rcx
push r10
push r11
mov edi,[socket_client_fd]
mov rsi,[rbp-24]
mov rdx,720
mov rcx,[socket_server_addr]
call [rel client_send wrt ..got]
pop r11
pop r10
pop rcx
; The value returned by client_send (0 on failure) is not examined

;______
; CODE:           while_counter += 1

while_block_exit_0:
add r15,[stride]
jmp label_401

;__________

label_899:

; send exit message to waiting (writer) core
; close_record lives in .bss; 999 at offset 704 is the marker the writer
; loop compares against.  The two loads into rax below are overwritten
; before use.
mov edi,[socket_client_fd]
lea rsi,[close_record]
mov rax,[rsi]
mov rax,[rsi+8]
mov rax,999
mov [rsi+704],rax
mov rdx,720
mov rcx,[socket_server_addr]
push r11
push rcx
call [rel client_send wrt ..got]
pop rcx
pop r11

add rsp,96 ; unwind the stack

; This ret is also the target of `je msg_queue_z` in Write_Data_To_Memory_fn
msg_queue_z:

ret

; __________

; label_900
; Common exit path reached by the jmp at the end of Init_Cores_fn.  Closes
; the output file, stores 234234 and 0 in the first two slots of
; Return_Pointer_Array, and returns the array's address in rax.
label_900:

; The qword loaded from collect_ptr's buffer is overwritten before it is used
mov rdi,[collect_ptr]
mov rax,[rdi]

mov rdi,[out_fname_ptr]
call [rel fclose wrt ..got]

lea rdi,[Return_Pointer_Array]
mov rax,234234
mov [rdi],rax ;rbp
mov rbp,0 ; rbp is zeroed here and stored as the second element
mov [rdi+8],rbp
mov rax,rdi

ret

; __________
; Main Entry

; __________
; Main Entry
; Main_Entry_fn(rdi = input data pointer, rsi = pointer to an array of doubles)
; Saves both arguments, reads X_length from the first double of the array,
; allocates the collect buffer, then calls Init_Cores_fn.  rax is not touched
; after that call, so whatever Init_Cores_fn's exit path left in rax is what
; the caller receives.

Main_Entry_fn:

push rdi
push rbp
push rbx

mov [X_ptr],rdi
mov [data_master_ptr],rsi
; Now assign lengths
Assign_lengths:
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0 ; truncate the double to a signed integer
mov [X_length],rax
add rcx,8
; __________
; malloc for dynamic arrays

lea rdi,[data_master_ptr]
mov rbp,[rdi]
movsd xmm0,qword[rbp]
cvttsd2si rax,xmm0
mov r8,rax
;Allocate 10 times size of input array
;but use the size of input array if it exceeds 10MB
mov rdx,10
mul rdx
mov rdx,10000000
cmp rax,rdx
jl malloc_next
mov rax,r8
malloc_next:
; The size computed above is discarded: rax is replaced by a fixed value
mov rax,200000000 ; hardwired
mov [initial_dynamic_length],rax
;__________

mov rdi,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call [rel malloc wrt ..got]
; The malloc result is stored without a NULL check (and stored twice)
mov qword [collect_ptr],rax
add rsp,40
mov [collect_ptr],rax
mov rax,qword[initial_dynamic_length]
mov [collect_length],rax
; __________

call Init_Cores_fn

pop rbx
pop rbp
pop rdi
ret
;____________________

; Write_Data_To_Memory_fn
; Receiver routine.  It allocates a wait buffer, opens the server socket,
; then loops: receive one record into socket_buffer with server_receive and
; append it to the output file with fwrite.  The loop ends when a received
; record has 999 at offset 704.
Write_Data_To_Memory_fn:

;r8  - where_from
;r9  - AVAILABLE (was next_core_number)
;r10 - next_iter_number
;r11 - wait_buffer_ptr
;r12 - wait_buffer_bitmask
;r13 - collect_ctr
;r14 - collect_ptr
;r15 - wait_buffer_items *****

; xmm31 - source pointer for write_data

; Allocate the "wait buffer" to hold records that are not next core
; and iter in sequence.  The buffer is 90×20×8 for 20 records 8 bytes per
; element and 90 items per record = 28,800 bytes can fit into L1D.
; NOTE(review): the call below requests 5760 items of 8 bytes (46,080 bytes),
; which does not match the figures in the comment above - confirm the
; intended size.  The calloc result is not checked.
mov rdi,5760 ;1800 ; items (64 rows)
mov rsi,8 ; size of each item in bytes
xor rax,rax 
sub rsp,40
call [rel calloc wrt ..got]
add rsp,40
mov r11,rax ; wait_buffer_ptr
mov rax,704
mov [wait_buffer_ctr],rax ; offset to core#

mov r14,[collect_ptr]
xor r13,r13 ; collect_ctr
xor r10,r10
xor r12,r12

mov rax,255
kmovq k7,rax

Open_Socket_Server:
; Open the Socket Server
; The two values written by create_socket_server are copied into
; socket_server_fd / socket_server_addr; the sender routine reads
; socket_server_addr as the destination for client_send.
; NOTE(review): the C prototype shown with this listing takes the sockaddr_un
; by value, while this call passes two pointers in rdi/rsi - confirm the
; argument mapping against the System V AMD64 calling convention.
lea rdi,[struc_svr] ;mov rdi,struc_svr
lea rsi,[retvals_svr]
push r10
push r11
sub rsp,480
call [rel create_socket_server wrt ..got]
add rsp,480
mov rdi,rax
mov rax,[rdi]
mov [socket_server_fd],rax
mov rax,[rdi+8]
mov [socket_server_addr],rax
pop r11
pop r10

; _________
; This the main record check loop for thread 2

mov rdi,[socket_server_fd]
lea rsi,[socket_buffer]

xor r8,r8 ;[where_from]
xor r15,r15 ;[wait_buffer_items]

; _________

Check_socket_receive:

;int64_t server_receive(int sfd, const void * buf, int msgLen, (struct sockaddr *) &claddr)
; rdi = socket, rsi = socket_buffer (720 bytes), rdx = 720,
; rcx = whatever the sender routine last stored in socket_client_addr.
mov rdi,[socket_server_fd]
lea rsi,[socket_buffer]
mov rdx,720
push r10
push r11
push r12
mov rcx,[socket_client_addr]
sub rsp,480
call [rel server_receive wrt ..got]
add rsp,480
pop r12
pop r11
pop r10
; The value returned by server_receive (0 on error) is not examined

; __________
; Process the records here - process until we've gone
; through all records, the loop back to Check_socket_receive

Check_Records:

; If it's next record up, process it
; NOTE(review): rsi is not reloaded after the call above; it is a
; caller-saved register under the System V AMD64 ABI, so the load below
; relies on server_receive leaving it pointing at socket_buffer.
; NOTE(review): the loop exits on the first record carrying 999; records
; still in flight from other senders at that moment are never read.
Check_Records_Next:
mov rax,[rsi+704] ; Core# (used only for this)
cmp rax,999 ; TEMPORARY; IN PRODUCTION WE PROCESS THE WAIT QUEUE FIRST
je msg_queue_z ; TEMPORARY; IN PRODUCTION WE PROCESS THE WAIT QUEUE FIRST

; fwrite(ptr = record, size = 8, nmemb = 90, stream = out_fname_ptr):
; appends the whole 720-byte record; rsi is saved and restored by hand.
push rsi
mov rdi,rsi
mov rsi,8
mov rcx,[out_fname_ptr]
mov rdx,90
call [rel fwrite wrt ..got]
pop rsi
jmp Check_socket_receive
; Not reached: the jmp above is unconditional
ret

I call this using an input array of 64-bit integers, from a C wrapper:

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

typedef double* (*shared_object_ptr)(double *X, double  *Length_Array_Out);

int64_t* Read_Data ( char * file_name );

// _______________

/*
 * Test driver for the shared object.
 *
 * Loads the input file with Read_Data, opens the shared object, looks up
 * Main_Entry_fn, calls it with the data pointer and a 3-element length
 * array whose first element is the file length in bytes, and prints the
 * processor time clock() measured around the call.
 *
 * Returns 0 on success.  Exits with status 1 when the input file cannot be
 * loaded or when dlopen/dlsym fails.
 */
int main(int argc, const char* argv[])
{
    char *file_name = "/opt/Test_Data/Random_4MB.bin";
    int64_t *file_info; // from Read_Data
    double *file_ptr;
    int64_t file_length;
    double length_array_out[3] = { 0 };

    // Timers
    clock_t start, end;
    double cpu_time_used;

    (void) argc;
    (void) argv;

    // Read the data into buffer(s)
    file_info = Read_Data ( file_name );

    // Read_Data returns 0 on any failure; do not dereference it in that case
    if (!file_info) {
        fprintf(stderr, "\nCould not load %s\n", file_name);
        exit(1);
    }

    file_ptr = (double *)(intptr_t)file_info[0];
    file_length = file_info[1];

    length_array_out[0] = (double)file_length;

    // int64_t is not guaranteed to be long, so print through long long
    printf ("%lld\n", (long long)file_info[0]);
    printf ("%lld\n", (long long)file_info[1]);

    // _______________
    // Open the shared object

    void *libhandle = dlopen("/opt/P01_SH/While_Loops_02/While_Loop_List_Comp.so", RTLD_NOW);

    if (!libhandle) {
        fprintf(stderr, "dlopen error: %s\n", dlerror());
        exit(1);
    }

    printf("dlopen success: handle %p\n", libhandle);

    // Clear any pending error so the dlerror() check below reports only dlsym
    dlerror();
    shared_object_ptr WL_02_fn = dlsym(libhandle, "Main_Entry_fn");

    char *err = dlerror();

    if (err) {
        fprintf(stderr, "dlsym failed: %s\n", err);
        exit(1);
    }

    printf("Calling the shared object\n");

    start = clock();
    double *ret_ptr = WL_02_fn(file_ptr, length_array_out);
    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("fn() took %f seconds to execute \n", cpu_time_used);

    // The returned pointer is not used further by this driver
    (void) ret_ptr;

    return 0;
}

// _________________
// Read the data
// Returns a 2-element integer array:  (1) pointer (2) length

/*
 * Read an entire file into a freshly allocated buffer.
 *
 * file_name  Path of the file to read (opened in binary mode).
 *
 * Returns a pointer to a 2-element static array:
 *   [0] the buffer address, stored as an integer; the caller owns the
 *       buffer and releases it with free()
 *   [1] the file length in bytes
 * The array is static, so the next call overwrites it.
 *
 * Returns 0 when the file cannot be opened, its size cannot be determined,
 * memory cannot be allocated, or the contents cannot be read completely.
 * Nothing is leaked on those paths.
 */
int64_t* Read_Data ( char * file_name )
{
    static int64_t return_array[2];

    // Open file
    FILE *file = fopen(file_name, "rb");

    if (!file)
    {
        fprintf(stderr, "Unable to open file %s", file_name);
        return 0;
    }

    // Get file length; ftell reports -1 on failure, which must not be
    // turned into an enormous unsigned size
    long end_pos = -1;

    if (fseek(file, 0, SEEK_END) == 0)
        end_pos = ftell(file);

    if (end_pos < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "Unable to determine size of file %s", file_name);
        fclose(file);
        return 0;
    }

    size_t fileLen = (size_t)end_pos;

    // Allocate memory (at least one byte, so an empty file is not reported
    // as an allocation failure)
    double *file_buffer = malloc(fileLen ? fileLen : 1);

    if (!file_buffer)
    {
        fprintf(stderr, "Memory error!");
        fclose(file);
        return 0;
    }

    // Read file contents into buffer; a short read is an error
    if (fread(file_buffer, 1, fileLen, file) != fileLen)
    {
        fprintf(stderr, "Unable to read file %s", file_name);
        free(file_buffer);
        fclose(file);
        return 0;
    }

    fclose(file);

    return_array[0] = (int64_t)(intptr_t)file_buffer;
    return_array[1] = (int64_t)fileLen;

    return return_array;
}

When I run the code without breakpoints (using gdb) the program hangs. If I set a breakpoint at Check_Records, even for part of the run, then it works all the way through. That suggests the need for a synchronization primitive, but that defeats the purpose of using the socket.

I suspect that I may need separate file descriptors for each core, but I haven't seen any resources that say I need to do that -- recvfrom includes socket_client_addr as a parameter, so if I had separate file descriptors for each core then I would have to call recvfrom in a loop.

Even this minimal example is a lot of code, but the simple question is why I get records only from core 0 on the receive side of the socket. The send side sends records from all cores.

Thanks for any help to find out why the server receives only records from core 0.

来源:https://stackoverflow.com/questions/61903585/unix-domain-datagram-socket-server-does-not-receive-all-records

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!