From: Mihail Dakov <mihail.dakov@ng4t.com>
To: linux-net@vger.kernel.org, netdev@vger.kernel.org
Subject: AF_PACKET: tx_ring mirrored in rx_ring?
Date: Mon, 21 Jul 2014 15:18:30 +0200 [thread overview]
Message-ID: <53CD1326.5090006@ng4t.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1658 bytes --]
Hello guys,
I am having trouble using the RX/TX ring buffer for AF_PACKET sockets.
I create two sockets (one for rx, one for tx). I bind those sockets to
the same interface. According to the docs you can create a socket per
direction or single socket for both directions (allocating double the
memory needed for a ring buffer, and then mapping first rx and then tx
buffer). In this case I opted for creating two sockets, one per
direction. The problem is that when I use the tx_ring to send over the
pf_socket I see those message "mirrored" in the rx_ring buffer which is
not the expected behavior for my application. In order to reproduce the
issue I simplified my application into a smaller one. Then I send a
manually created ping message with adjusted mac and ip address so that a
remote machine in my local network answers it. I successfully see the
ping request double (once in the tx_ring and once in the rx_ring). Which
I think is not expected behavior. This application was tested on kernel
3.14.12-1 and was compiled with gcc (Debian 4.8.3-5) and on kernel
3.2.0-52-lowlatency with compiler gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3.
So some questions have arisen:
1. Is this normal behavior? If it is, why? I mean, if I use a socket per
direction I expect to see only packets for that direction on the
correspondent socket, right?
2. Could you provide some more insights about why this "problem" is
happening? Am I doing it wrong? Did I get it wrong (the whole ring
buffer in af_packets)? Am I using wrong settings?
I have attached the simple program which should reproduce the issue.
--
Mihail Dakov
mihail.dakov@ng4t.com
[-- Attachment #2: pftest.cpp --]
[-- Type: text/x-c++src, Size: 9584 bytes --]
#include <cstdio>
#include <cstdint>
#include <cstring>
#include <cstdlib>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>
#include <net/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/sockios.h>
#include <errno.h>
#include <signal.h>
#define BLOCK_SZ (4096 << 8)
#define FRAME_SZ 2048
#define IP_HLEN 20
struct ring3_t
{
uint8_t *rx_buf;
uint32_t brx;//current block idx
struct tpacket_req3 req;
ring3_t()
{
rx_buf = NULL;
brx = 0;
}
};
struct ring_t
{
uint8_t *rx_buf;
uint8_t *tx_buf;
uint32_t ftx;//current frame idx for tx
uint32_t frx;//current frame idx for rx
struct tpacket_req req;
ring_t()
{
rx_buf = tx_buf = NULL;
ftx = frx = 0;
}
};
static int rx_kernel_ready(struct tpacket_hdr_v1 *hdr)
{
return (hdr->block_status & TP_STATUS_USER);
}
static void rx_user_ready(struct tpacket_hdr_v1 *hdr)
{
hdr->block_status = TP_STATUS_KERNEL;
}
static int tx_kernel_ready(struct tpacket2_hdr *hdr)
{
return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
}
static void tx_user_ready(struct tpacket2_hdr *hdr)
{
hdr->tp_status = TP_STATUS_SEND_REQUEST;
}
void filltxring(int sock, uint32_t *frame, ring_t *ring, uint8_t *data, uint32_t len);
uint32_t seq = 0, frametx = 0, flushneed = 0;
int sockrx,socktx, rbuf = 16777216, sbuf = 16777216;
ring_t txring;
void signal_handler(int signum)
{
switch(signum)
{
case SIGHUP:
{
uint8_t data[128];
uint8_t const ping[] = {
0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xBB,0xBB,0xBB,0xBB,0xBB,0xBB,0x08,0x00,0x45,0x00,
0x00,0x54,0xb3,0x31,0x40,0x00,0x40,0x01,0x9f,0x18,0xCC,0xCC,0xCC,0xCC,0xDD,0xDD,
0xDD,0xDD,0x08,0x00,0x71,0xae,0x02,0x35,0x00,0x01,0xed,0xda,0xcc,0x53,0x00,0x00,
0x00,0x00,0x00,0x1a,0x0b,0x00,0x00,0x00,0x00,0x00,0x10,0x11,0x12,0x13,0x14,0x15,
0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,
0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,
0x36,0x37
};
std::memmove(data,ping,98);
filltxring(socktx, &frametx, &txring, data, 98);
break;
}
default:
break;
}
}
void flushtx(int sock)
{
if (flushneed)
{
if (sendto(sock, NULL, 0, MSG_DONTWAIT, NULL, 0) < 0)
fprintf(stderr, "flushtx: sendto() error %s\n",strerror(errno));
flushneed = 0;
}
}
void filltxring(int sock, uint32_t *frame, ring_t *ring, uint8_t *data, uint32_t len)
{
struct tpacket2_hdr *hdr = NULL;
uint8_t *buf = NULL,
*base = (uint8_t*)(ring->tx_buf+(*frame)*FRAME_SZ);
hdr = (struct tpacket2_hdr *)base;
if (tx_kernel_ready(hdr))
{
buf = base+(TPACKET2_HDRLEN-sizeof(struct sockaddr_ll));
std::memmove(buf,data,len);
struct ethhdr *ethh = (struct ethhdr*)buf;
uint8_t *smac = (uint8_t*)ethh->h_source;
uint8_t *dmac = (uint8_t*)ethh->h_dest;
struct iphdr *iph = (struct iphdr*)&buf[ETH_HLEN];
fprintf(stderr,"ftx:%d,len:%d################"
"smac=%02x:%02x:%02x:%02x:%02x:%02x,"
"dmac=%02x:%02x:%02x:%02x:%02x:%02x,"
"sa:%08x,da:%08x\n",
*frame,len,
smac[0],smac[1],smac[2],smac[3],smac[4],smac[5],
dmac[0],dmac[1],dmac[2],dmac[3],dmac[4],dmac[5],
iph->saddr,iph->daddr);
hdr->tp_len = len;
hdr->tp_snaplen = len;
tx_user_ready(hdr);
flushneed = 1;
//next frame
*frame = ((*frame) + 1) % ring->req.tp_frame_nr;
}
}
void walkrxring(int sock, int *block, ring3_t *ring)
{
while (1)
{
struct tpacket_block_desc *bd = NULL;
struct tpacket3_hdr *hdr = NULL;
uint8_t *data = NULL;
bd = (struct tpacket_block_desc*)(ring->rx_buf+ (*block)*BLOCK_SZ);
if (rx_kernel_ready(&bd->hdr.bh1))
{
hdr = (struct tpacket3_hdr*)((uint8_t*)bd+bd->hdr.bh1.offset_to_first_pkt);
for (uint32_t p=0;p<bd->hdr.bh1.num_pkts;p++)
{
data = (uint8_t*)hdr+hdr->tp_mac;
if (hdr->tp_snaplen < FRAME_SZ)//only packet <
{
struct ethhdr *ethh = (struct ethhdr*)data;
uint8_t *smac = (uint8_t*)ethh->h_source;
uint8_t *dmac = (uint8_t*)ethh->h_dest;
struct iphdr *iph = (struct iphdr*)&data[ETH_HLEN];
struct udphdr *udph = (struct udphdr*)&data[ETH_HLEN+IP_HLEN];
fprintf(stderr,"p:%d,len:%d,nump:%d,blk:%d###"
"smac=%02x:%02x:%02x:%02x:%02x:%02x,"
"dmac=%02x:%02x:%02x:%02x:%02x:%02x,"
"sa:%08x,da:%08x,sp:%u,dp:%u\n",
p,hdr->tp_snaplen,bd->hdr.bh1.num_pkts,*block,
smac[0],smac[1],smac[2],smac[3],smac[4],smac[5],
dmac[0],dmac[1],dmac[2],dmac[3],dmac[4],dmac[5],
iph->saddr,iph->daddr,
ntohs(udph->source),ntohs(udph->dest));
}
hdr = (struct tpacket3_hdr*)((uint8_t*)hdr+hdr->tp_next_offset);
}
rx_user_ready(&bd->hdr.bh1);
//next block
*block = ((*block) + 1) % ring->req.tp_block_nr;
} else {
return;//
}
}
}
int pfsocket(int protocol,
int version,
bool trans,
struct ifreq *req,
struct sockaddr_ll *addr,
char *devname,
int rsize,
int ssize)
{
int sock, discardoff = 1;
if (trans)
sock = socket(AF_PACKET, SOCK_RAW, 0);//Only TX
else
sock = socket(AF_PACKET, SOCK_RAW, htons(protocol));
if (sock < 0)
return -1;
std::strncpy(req->ifr_ifrn.ifrn_name, devname, IFNAMSIZ);
if (ioctl(sock, SIOGIFINDEX, req) < 0)
return -2;
addr->sll_family = AF_PACKET;
addr->sll_ifindex = req->ifr_ifru.ifru_ivalue;
if (trans)
addr->sll_protocol = 0;//tx only
else
addr->sll_protocol = htons(protocol);
addr->sll_pkttype = 0;
addr->sll_halen = 0;
addr->sll_hatype = 0;
if (ioctl(sock, SIOCGIFHWADDR, req) < 0)
return -3;
if (setsockopt(sock,SOL_SOCKET, SO_RCVBUFFORCE,&rsize,sizeof(rsize)) < 0)
return -4;
if (setsockopt(sock,SOL_SOCKET, SO_SNDBUFFORCE,&ssize,sizeof(ssize)) < 0)
return -5;
if (setsockopt(sock, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0)
return -6;
if (setsockopt(sock, SOL_PACKET, PACKET_LOSS, &discardoff, sizeof(discardoff)) < 0)
return -7;
return sock;
}
void *slayout(void *ring, bool v3, size_t mmsize)
{
if (v3)
{
struct ring3_t *r = (struct ring3_t*)ring;
std::memset(&r->req,0,sizeof(r->req));
r->req.tp_block_nr = mmsize/BLOCK_SZ;
r->req.tp_block_size = BLOCK_SZ;
r->req.tp_frame_size = FRAME_SZ;
r->req.tp_frame_nr = (BLOCK_SZ/FRAME_SZ)*r->req.tp_block_nr;
r->req.tp_retire_blk_tov = 1;//1ms scanning interval
// r->req.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;
r->req.tp_feature_req_word = 0;
ring = (void*)r;
} else {
struct ring_t *r = (struct ring_t *)ring;
std::memset(&r->req,0,sizeof(r->req));
r->req.tp_block_nr = mmsize/BLOCK_SZ;
r->req.tp_block_size = BLOCK_SZ;
r->req.tp_frame_size = FRAME_SZ;
r->req.tp_frame_nr = (BLOCK_SZ/FRAME_SZ)*r->req.tp_block_nr;
ring = (void*)r;
}
return ring;
}
void *setuprxring(int sock, struct ring3_t *ring, size_t mmsize)
{
if (slayout((void*)ring,true,mmsize) == NULL)
return NULL;
if (setsockopt(sock, SOL_PACKET,PACKET_RX_RING,(void*)&ring->req,sizeof(ring->req)) < 0)
return NULL;
ring->rx_buf = (uint8_t*)mmap(NULL,mmsize,PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_LOCKED,sock,0);
if (ring->rx_buf == MAP_FAILED)
return NULL;
return (void*)ring;
}
void *setuptxring(int sock, struct ring_t *ring, size_t mmsize)
{
if (slayout((void*)ring,false,mmsize)==NULL)
return NULL;
if (setsockopt(sock, SOL_PACKET, PACKET_TX_RING,(void*)&ring->req,sizeof(ring->req)) < 0)
return NULL;
ring->tx_buf = (uint8_t*)mmap(NULL,mmsize,PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_LOCKED,
sock,
0);
if (ring->tx_buf == MAP_FAILED)
return NULL;
return (void*)ring;
}
int main(int argc, char **argv)
{
if (argc != 2)
{
fprintf(stderr, "Usage: %s <dev_name>\n", argv[0]);
exit(EXIT_SUCCESS);
}
struct sockaddr_ll ifa;
struct ifreq ifr;
char *device = new char[IFNAMSIZ];
ring3_t rxring;
std::memset(&ifa,0,sizeof(ifa));
std::memset(&ifr,0,sizeof(ifr));
std::memset(&txring,0,sizeof(txring));
std::memset(&rxring,0,sizeof(rxring));
std::memset(device,0,IFNAMSIZ);
std::strcpy(device, argv[1]);
sockrx = pfsocket(ETH_P_ALL,TPACKET_V3,false,&ifr,&ifa,device,rbuf,sbuf);
if (sockrx < 0)
return sockrx;
fprintf(stderr, "Socket rx(%d) created\n",sockrx);
if (setuprxring(sockrx,&rxring,rbuf) == NULL)
return -8;
fprintf(stderr, "Ring rx setup done.\n");
if (bind(sockrx,(struct sockaddr*)&ifa,sizeof(ifa)) < 0)
return -9;
fprintf(stderr, "Socket rx(%d) bound to %s\n", sockrx, device);
socktx = pfsocket(ETH_P_ALL,TPACKET_V2,true,&ifr,&ifa,device,rbuf,sbuf);
if (socktx < 0)
return socktx;
fprintf(stderr, "Socket tx(%d) created\n", socktx);
if (setuptxring(socktx,&txring,sbuf) == NULL)
return -10;
fprintf(stderr, "Ring tx setup done.\n");
if (bind(socktx,(struct sockaddr*)&ifa,sizeof(ifa)) < 0)
return -11;
fprintf(stderr, "Socket tx(%d) bound to %s\n", socktx, device);
uint32_t nfds = 1;
int ret = 0, block = 0;
struct pollfd fds[nfds];
fds[0].fd = sockrx;
fds[0].events = POLLIN|POLLRDNORM|POLLERR;
fds[0].revents = 0;
sigset_t newmask, zeromask;
struct timespec tv;
std::memset(&tv,0,sizeof(tv));
sigemptyset(&zeromask);
sigemptyset(&newmask);
sigaddset(&newmask,SIGINT);
signal(SIGHUP, signal_handler);
while (1)
{
tv.tv_nsec = 1000000;//1ms
ret = ppoll(fds,nfds,&tv,&zeromask);
if (ret < 0 && errno == EINTR)
continue;
if (ret < 0)
{
fprintf(stderr, "ppoll() error:%s\n", strerror(errno));
exit(EXIT_FAILURE);
}
//read rxring every 1ms
walkrxring(sockrx,&block,&rxring);
//try to flush every 1ms
flushtx(socktx);
}
return 0;
}
next reply other threads:[~2014-07-21 13:25 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-07-21 13:18 Mihail Dakov [this message]
2014-07-21 13:38 ` AF_PACKET: tx_ring mirrored in rx_ring? Mihail Dakov
2014-07-21 13:51 ` Daniel Borkmann
2014-07-21 14:40 ` Mihail Dakov
2014-07-21 14:44 ` Fwd: " Mihail Dakov
2014-07-21 15:13 ` Daniel Borkmann
2014-07-21 18:32 ` mihail.dakov
2014-07-21 22:35 ` Willem de Bruijn
2014-07-21 22:36 ` Willem de Bruijn
2014-07-22 13:39 ` Mihail Dakov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=53CD1326.5090006@ng4t.com \
--to=mihail.dakov@ng4t.com \
--cc=linux-net@vger.kernel.org \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.