问题
Is there any way for the Linux select() call relay event ordering?
A description of what I'm seeing:
On one machine, I wrote a simple program which sends three multicast packets, one to each of three different multicast groups. These packets are sent back-to-back, with no delay in between. I.e. sendto(mcast_group1); sendto(mcast_group2); sendto(mcast_group3).
On the other machine, I have a receiving program. The program uses one socket per multicast group. Each socket does a bind() and IP_ADD_MEMBERSHIP (i.e. join/subscribe) to the address to which it listens. The program then does a select() on the three sockets.
When select returns, all three sockets are available for reading. But which one came first? The ready-for-reading list of sockets is a set, and therefore has no order. What I would like is if select() returned exactly once per received packet, in order (the increased overhead is acceptable here). Or, is there some other kind of mechanism I can use to determine packet receive order?
Additional information:
- OS is CentOS 5 (effectively Redhat Enterprise Linux) on x86_64
- NIC hardware is an Intel 82571EB
- I've tried e1000e driver versions 1.3.10-k2 and 2.1.4-NAPI
- I've tried pinning the NIC's interrupt to an unloaded and isolated CPU core
- I've disabled hardware IRQ coalescing via setting the driver option InterruptThrottleRate=0, and setting rx-usecs=0 via ethtool
- I also tried using epoll, and it has the same behavior
A final remark: packet ordering is preserved if I only use one socket. In this case, I bind to INADDR_ANY (0.0.0.0) and do the IP_ADD_MEMBERSHIP multiple times on the same socket. But this does not work for our application, because we need the filtering provided by binding to the actual multicast address. Ultimately, there will be multiple multicast receiving programs on the same machine, with subscription sets that may intersect with each other. So maybe an alternate solution is to find another way to achieve the filtering effect of bind(), but without bind().
回答1:
You can use IP_PKTINFO to get the address of the multicast group the packet was send to - even if the socket is subscribed for a bunch of multicast groups. Having this in place, you will get the packets in order and the ability to filter by group addresses. See the example below:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <ctype.h>
#include <errno.h>
#define PORT 1234
#define PPANIC(msg) perror(msg); exit(1);
#define STATS_PATCH 0
int main(int argc, char **argv)
{
fd_set master;
fd_set read_fds;
struct sockaddr_in serveraddr;
int sock;
int opt = 1;
size_t i;
int rc;
char *mcast_groups[] = {
"226.0.0.1",
"226.0.0.2",
NULL
};
#if STATS_PATCH
struct stat stat_buf;
#endif
struct ip_mreq imreq;
FD_ZERO(&master);
FD_ZERO(&read_fds);
rc = sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
if(rc == -1)
{
PPANIC("socket() failed");
}
rc = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
if(rc == -1)
{
PPANIC("setsockopt(reuse) failed");
}
memset(&serveraddr, 0, sizeof(serveraddr));
serveraddr.sin_family = AF_INET;
serveraddr.sin_port = htons(PORT);
serveraddr.sin_addr.s_addr = htonl(INADDR_ANY);
rc = bind(sock, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
if(rc == -1)
{
PPANIC("bind() failed");
}
rc = setsockopt(sock, IPPROTO_IP, IP_PKTINFO, &opt, sizeof(opt));
if(rc == -1)
{
PPANIC("setsockopt(IP_PKTINFO) failed");
}
for (i = 0; mcast_groups[i] != NULL; i++)
{
imreq.imr_multiaddr.s_addr = inet_addr(mcast_groups[i]);
imreq.imr_interface.s_addr = INADDR_ANY;
rc = setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, (const void *)&imreq, sizeof(struct ip_mreq));
if (rc != 0)
{
PPANIC("joing mcast group failed");
}
}
FD_SET(sock, &master);
while(1)
{
read_fds = master;
rc = select(sock + 1, &read_fds, NULL, NULL, NULL);
if (rc == 0)
{
continue;
}
if(rc == -1)
{
PPANIC("select() failed");
}
if(FD_ISSET(sock, &read_fds))
{
char buf[1024];
int inb;
char ctrl_msg_buf[1024];
struct iovec iov[1];
iov[0].iov_base = buf;
iov[0].iov_len = 1024;
struct msghdr msg_hdr = {
.msg_iov = iov,
.msg_iovlen = 1,
.msg_name = NULL,
.msg_namelen = 0,
.msg_control = ctrl_msg_buf,
.msg_controllen = sizeof(ctrl_msg_buf),
};
struct cmsghdr *ctrl_msg_hdr;
inb = recvmsg(sock, &msg_hdr, 0);
if (inb < 0)
{
PPANIC("recvmsg() failed");
}
for (ctrl_msg_hdr = CMSG_FIRSTHDR(&msg_hdr); ctrl_msg_hdr != NULL; ctrl_msg_hdr = CMSG_NXTHDR(&msg_hdr, ctrl_msg_hdr))
{
if (ctrl_msg_hdr->cmsg_level == IPPROTO_IP && ctrl_msg_hdr->cmsg_type == IP_PKTINFO)
{
struct in_pktinfo *pckt_info = (struct in_pktinfo *)CMSG_DATA(ctrl_msg_hdr);
printf("got data for mcast group: %s\n", inet_ntoa(pckt_info->ipi_addr));
break;
}
}
printf("|");
for (i = 0; i < inb; i++)
printf("%c", isprint(buf[i])?buf[i]:'?');
printf("|\n");
#if STATS_PATCH
rc = fstat(sock, &stat_buf);
if (rc == -1)
{
perror("fstat() failed");
} else {
printf("st_atime: %d\n", stat_buf.st_atime);
printf("st_mtime: %d\n", stat_buf.st_mtime);
printf("st_ctime: %d\n", stat_buf.st_ctime);
}
#endif
}
}
return 0;
}
the code below won't solve OPs problem but may guide people dealing with similar requirements
(EDIT) One should not do such things late at night... even with that solution you will only get the order the fd was handled by select - and this will give you no indication about the time of the frame arrival.
As stated here, it is currently not possible to retrieve the order of the sockets or the timestamps they changed as the required callback is not set for socket inodes. But if you are able to patch your kernel, you may work around the problem by setting the time within the select system call.
The following patch may give you an idea:
diff --git a/fs/select.c b/fs/select.c
index 467bb1c..3f2927e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -435,6 +435,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
@@ -452,6 +455,16 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
f = fdget(i);
if (f.file) {
const struct file_operations *f_op;
+ struct kstat stat;
+
+ int ret;
+ u8 is_sock = 0;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if(ret == 0 && S_ISSOCK(stat.mode)) {
+ is_sock = 1;
+ }
+
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
if (f_op->poll) {
@@ -464,16 +477,22 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
res_in |= bit;
retval++;
wait->_qproc = NULL;
+ if(is_sock && f.file->f_inode)
+ f.file->f_inode->i_ctime.tv_sec = tv.tv_sec;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
wait->_qproc = NULL;
+ if(is_sock && f.file->f_inode)
+ f.file->f_inode->i_ctime.tv_sec = tv.tv_sec;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
wait->_qproc = NULL;
+ if(is_sock && f.file->f_inode)
+ f.file->f_inode->i_ctime.tv_sec = tv.tv_sec;
}
/* got something, stop busy polling */
if (retval) {
Notes:
this is... just for you :) - don't expect it in the mainline
do_gettimeofday() is called before each relevant fd is tested. to get higher granularity this should be done in each iteration (and only if needed). since the stat-interface only offers a granularity of one second you may (!UGLY!) use the remaining time attributes to map the fractions of a second to those fields.
this was done using kernel 3.16.0 and is not well tested. don't use it in a space ship or medical equipment. if you would like to try it, get a filesystem-image (eg. https://people.debian.org/~aurel32/qemu/amd64/debian_wheezy_amd64_standard.qcow2) and use qemu to test it:
sudo qemu-system-x86_64 -kernel arch/x86/boot/bzImage -hda debian_wheezy_amd64_standard.qcow2 -append "root=/dev/sda1"
回答2:
If select() returns > 1 the events must have been so close together as to make the question of ordering meaningless.
回答3:
You can obtain the timestamp at which a file descriptor became ready using fstat.
For more info read http://pubs.opengroup.org/onlinepubs/009695399/functions/fstat.html
来源:https://stackoverflow.com/questions/17529223/linux-select-and-fifo-ordering-of-multiple-sockets