Browse Source

Merge upstream version 16

David Martínez Moreno 16 years ago
parent
commit
b6f1fe61c1
11 changed files with 773 additions and 17 deletions
  1. 15 0
      HACKING
  2. 12 0
      NEWS
  3. 4 3
      README
  4. 16 9
      aoe.c
  5. 7 3
      ata.c
  6. 31 0
      contrib/vblade-15-aio.2.README
  7. 536 0
      contrib/vblade-15-aio.2.diff
  8. 11 0
      contrib/vblade-15-socketfilter.2.README
  9. 139 0
      contrib/vblade-15-socketfilter.2.diff
  10. 1 1
      dat.h
  11. 1 1
      fns.h

+ 15 - 0
HACKING

@@ -10,3 +10,18 @@ Patches should be clean (to the point and easy to read) and should do
 one thing.  Send multiple patches if necessary.  Patches should be
 generated with "diff -uprN" if possible, and should be designed to be
 applied with "patch -p1".
+
+When possible, the best way to submit a patch is by sending it to the
+aoetools-discuss list.  You can subscribe at the aoetools project web
+page on sourceforge.net.
+
+When you send your patch, here are some things to cover:
+
+  * What version of the vblade did you use to generate the patch?
+    (Hopefully it was the latest.)
+
+  * What was your motivation for creating the patch?  That is, what
+    problem does it solve?
+
+  * What testing did you perform to ensure that your patch did not
+    introduce bugs and accomplished what you intended?

+ 12 - 0
NEWS

@@ -1,4 +1,16 @@
 -*- change-log -*-
+2008-05-07 Ed L. Cashin <ecashin@coraid.com>
+	add Chris Webb's AIO patch to the contributions
+	add Chris Webb's BPF patch to the contributions
+	vblade-16
+
+2008-02-20 Ed L. Cashin <ecashin@coraid.com>
+	require the amount of data we use, not the amount ethernet requires
+	make sure the packet length agrees with the config query length
+	make sure the packet length agrees with the amount to write
+	remove newline embedded in fw version field of ATA dev ID response
+	vblade-15
+	
 2006-11-20 Sam Hopkins <sah@coraid.com>
 	apply contrib jumbo patch to standard distribution
 	add jumbo configuration app. note in README

+ 4 - 3
README

@@ -7,9 +7,10 @@ seekable file available over an ethernet local area network (LAN) via
 the ATA over Ethernet (AoE) protocol.
 
 The seekable file is typically a block device like /dev/md0 but even
-regular files will work.  When vblade exports the block storage over
-AoE it becomes a storage target.  Another host on the same LAN can
-access the storage if it has a compatible aoe kernel driver.
+regular files will work.  Sparse files can be especially convenient.
+When vblade exports the block storage over AoE it becomes a storage
+target.  Another host on the same LAN can access the storage if it has
+a compatible aoe kernel driver.
 
 BUILDING
 --------

+ 16 - 9
aoe.c

@@ -78,7 +78,7 @@ getlba(uchar *p)
 }
 
 int
-aoeata(Ata *p)	// do ATA reqeust
+aoeata(Ata *p, int pktlen)	// do ATA reqeust
 {
 	Ataregs r;
 	int len = 60;
@@ -88,7 +88,7 @@ aoeata(Ata *p)	// do ATA reqeust
 	r.sectors = p->sectors;
 	r.feature = p->err;
 	r.cmd = p->cmd;
-	if (atacmd(&r, (uchar *)(p+1), maxscnt*512) < 0) {
+	if (atacmd(&r, (uchar *)(p+1), maxscnt*512, pktlen - sizeof(*p)) < 0) {
 		p->h.flags |= Error;
 		p->h.error = BadArg;
 		return len;
@@ -109,13 +109,13 @@ aoeata(Ata *p)	// do ATA reqeust
 // yes, this makes unnecessary copies.
 
 int
-confcmd(Conf *p)	// process conf request
+confcmd(Conf *p, int payload)	// process conf request
 {
 	int len;
 
 	len = ntohs(p->len);
 	if (QCMD(p) != Qread)
-	if (len > Nconfig)
+	if (len > Nconfig || len > payload)
 		return 0;	// if you can't play nice ...
 	switch (QCMD(p)) {
 	case Qtest:
@@ -156,16 +156,23 @@ confcmd(Conf *p)	// process conf request
 }
 
 void
-doaoe(Aoehdr *p)
+doaoe(Aoehdr *p, int n)
 {
 	int len;
+	enum {	// config query header size
+		CHDR_SIZ = sizeof(Conf) - sizeof(((Conf *)0)->data),
+	};
 
 	switch (p->cmd) {
 	case ATAcmd:
-		len = aoeata((Ata*)p);
+		if (n < sizeof(Ata))
+			return;
+		len = aoeata((Ata*)p, n);
 		break;
 	case Config:
-		len = confcmd((Conf *)p);
+		if (n < CHDR_SIZ)
+			return;
+		len = confcmd((Conf *)p, n - CHDR_SIZ);
 		if (len == 0)
 			return;
 		break;
@@ -202,7 +209,7 @@ aoe(void)
 			perror("read network");
 			exit(1);
 		}
-		if (n < 60)
+		if (n < sizeof(Aoehdr))
 			continue;
 		p = (Aoehdr *) buf;
 		if (ntohs(p->type) != 0x88a2)
@@ -216,7 +223,7 @@ aoe(void)
 			continue;
 		if (nmasks && !maskok(p->src))
 			continue;
-		doaoe(p);
+		doaoe(p, n);
 	}
 	free(buf);
 }

+ 7 - 3
ata.c

@@ -86,7 +86,7 @@ atainit(void)
 	char buf[64];
 
 	setfld(ident, 27, 40, "Coraid EtherDrive vblade");
-	sprintf(buf, "V%d\n", VBLADE_VERSION);
+	sprintf(buf, "V%d", VBLADE_VERSION);
 	setfld(ident, 23, 8, buf);
 	setfld(ident, 10, 20, "SSN HERE");
 }
@@ -98,7 +98,7 @@ atainit(void)
  * check for that.
  */
 int
-atacmd(Ataregs *p, uchar *dp, int ndp)		// do the ata cmd
+atacmd(Ataregs *p, uchar *dp, int ndp, int payload) // do the ata cmd
 {
 	vlong lba;
 	ushort *ip;
@@ -156,8 +156,12 @@ atacmd(Ataregs *p, uchar *dp, int ndp)		// do the ata cmd
 	}
 	if (p->cmd == 0x20 || p->cmd == 0x24)
 		n = getsec(bfd, dp, lba, p->sectors);
-	else
+	else {
+		// packet should be big enough to contain the data
+		if (payload < 512 * p->sectors)
+			return -1;
 		n = putsec(bfd, dp, lba, p->sectors);
+	}
 	n /= 512;
 	if (n != p->sectors) {
 		p->err = ABRT;

+ 31 - 0
contrib/vblade-15-aio.2.README

@@ -0,0 +1,31 @@
+This proof-of-concept patch modifies vblade to access the underlying block
+device using POSIX asynchronous IO (AIO) rather than using normal blocking
+read() and write(). AIO allows vblade to receive and queue several ATA
+read/write commands at once, returning the response to the client
+asynchronously as each IO operation completes. It should be most beneficial
+for devices which experience very non-sequential IO. An AIO-enabled vblade is
+also a good starting point if you want to generalise vblade to export multiple
+devices without the complexity and overhead of a multithreaded approach.
+
+The patch implements AIO support for both Linux and FreeBSD, but I have not
+tested the FreeBSD support and would therefore be especially interested to
+hear success/failure reports for compiling and running AIO vblade on FreeBSD.
+A SIGIO handler which writes a single byte to a pipe is used to notify the
+main poll() loop that AIO operations have completed and are ready to return to
+the client. Running oprofile on a box with a heavily loaded loopback
+vblade-aio suggests that it spends an inordinate amount of time in the signal
+handler. Some method of poll()ing directly on the AIO events at the same time
+as the socket fd could cut this overhead out completely.
+
+More generally, experimenting on Linux with standard O_DIRECT vblade and
+O_DIRECT vblade-aio on a loopback interface with MTU 9000 suggests that the
+performance difference on a single RAID1-backed block device is fairly small:
+swamped by the performance of the network and the underlying block device.
+However, the POSIX AIO in glibc librt is emulated in userspace threads rather
+than using the kernel AIO api. A kernel-backed POSIX AIO implementation should
+perform better, especially for multiple access to a single block device.
+
+I would be delighted to hear any feedback and experiences from people running
+vblade together with this patch.
+
+Chris Webb <chris@arachsys.com>, 2008-04-21.

+ 536 - 0
contrib/vblade-15-aio.2.diff

@@ -0,0 +1,536 @@
+diff -uprN vblade-15/aoe.c vblade-15-aio/aoe.c
+--- vblade-15/aoe.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/aoe.c	2008-04-19 22:31:21.000000000 +0100
+@@ -8,6 +8,9 @@
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <netinet/in.h>
++#include <errno.h>
++#include <aio.h>
++#include <poll.h>
+ #include "dat.h"
+ #include "fns.h"
+ 
+@@ -22,6 +25,11 @@ char config[Nconfig];
+ int nconfig = 0;
+ int maxscnt = 2;
+ char *ifname;
++int queuepipe[2];
++int pktlen[Nplaces], pending[Nplaces];
++Ata *pkt[Nplaces];
++Ataregs regs[Nplaces];
++struct aiocb aiocb[Nplaces];
+ 
+ void
+ aoead(int fd)			// advertise the virtual blade
+@@ -78,32 +86,52 @@ getlba(uchar *p)
+ }
+ 
+ int
+-aoeata(Ata *p, int pktlen)	// do ATA reqeust
++aoeata(int place)	// do ATA reqeust
+ {
+-	Ataregs r;
+-	int len = 60;
+ 	int n;
++	int len = 60; // minimum ethernet packet size
+ 
+-	r.lba = getlba(p->lba);
+-	r.sectors = p->sectors;
+-	r.feature = p->err;
+-	r.cmd = p->cmd;
+-	if (atacmd(&r, (uchar *)(p+1), maxscnt*512, pktlen - sizeof(*p)) < 0) {
+-		p->h.flags |= Error;
+-		p->h.error = BadArg;
++	regs[place].lba = getlba(pkt[place]->lba);
++	regs[place].sectors = pkt[place]->sectors;
++	regs[place].feature = pkt[place]->err;
++	regs[place].cmd = pkt[place]->cmd;
++	n = atacmd(regs + place, (uchar *)(pkt[place] + 1), maxscnt*512,
++				pktlen[place] - sizeof(Ata), aiocb + place);
++	if (n < 0) {
++		pkt[place]->h.flags |= Error;
++		pkt[place]->h.error = BadArg;
+ 		return len;
++	} else if (n > 0) {
++		pending[place] = 1;
++		return 0;
++	}
++	if (!(pkt[place]->aflag & Write) && (n = pkt[place]->sectors)) {
++		n -= regs[place].sectors;
++		len = sizeof (Ata) + (n*512);
+ 	}
+-	if (!(p->aflag & Write))
+-	if ((n = p->sectors)) {
+-		n -= r.sectors;
++	pkt[place]->sectors = regs[place].sectors;
++	pkt[place]->err = regs[place].err;
++	pkt[place]->cmd = regs[place].status;
++	return len;
++}
++
++int aoeatacomplete(int place, int pktlen)
++{
++	int n;
++	int len = 60; // minimum ethernet packet size
++	atacmdcomplete(regs + place, aiocb + place);
++	if (!(pkt[place]->aflag & Write) && (n = pkt[place]->sectors)) {
++		n -= regs[place].sectors;
+ 		len = sizeof (Ata) + (n*512);
+ 	}
+-	p->sectors = r.sectors;
+-	p->err = r.err;
+-	p->cmd = r.status;
++	pkt[place]->sectors = regs[place].sectors;
++	pkt[place]->err = regs[place].err;
++	pkt[place]->cmd = regs[place].status;
++	pending[place] = 0;
+ 	return len;
+ }
+ 
++
+ #define QCMD(x) ((x)->vercmd & 0xf)
+ 
+ // yes, this makes unnecessary copies.
+@@ -156,8 +184,9 @@ confcmd(Conf *p, int payload)	// process
+ }
+ 
+ void
+-doaoe(Aoehdr *p, int n)
++doaoe(int place)
+ {
++	Aoehdr *p = (Aoehdr *) pkt[place];
+ 	int len;
+ 	enum {	// config query header size
+ 		CHDR_SIZ = sizeof(Conf) - sizeof(((Conf *)0)->data),
+@@ -165,14 +194,16 @@ doaoe(Aoehdr *p, int n)
+ 
+ 	switch (p->cmd) {
+ 	case ATAcmd:
+-		if (n < sizeof(Ata))
++		if (pktlen[place] < sizeof(Ata))
++			return;
++		len = aoeata(place);
++		if (len == 0)
+ 			return;
+-		len = aoeata((Ata*)p, n);
+ 		break;
+ 	case Config:
+-		if (n < CHDR_SIZ)
++		if (pktlen[place] < CHDR_SIZ)
+ 			return;
+-		len = confcmd((Conf *)p, n - CHDR_SIZ);
++		len = confcmd((Conf *)p, pktlen[place] - CHDR_SIZ);
+ 		if (len == 0)
+ 			return;
+ 		break;
+@@ -193,25 +224,129 @@ doaoe(Aoehdr *p, int n)
+ }
+ 
+ void
++doaoecomplete(int place)
++{
++	Aoehdr *p = (Aoehdr *) pkt[place];
++	int len = aoeatacomplete(place, pktlen[place]);
++	memmove(p->dst, p->src, 6);
++	memmove(p->src, mac, 6);
++	p->maj = htons(shelf);
++	p->min = slot;
++	p->flags |= Resp;
++	if (putpkt(sfd, (uchar *) p, len) == -1) {
++		perror("write to network");
++		exit(1);
++	}
++
++}
++
++// allocate the buffer so that the ata data area
++// is page aligned for o_direct on linux
++
++void *
++bufalloc(void **buf, long len)
++{
++	long psize;
++	unsigned long n;
++
++	psize = sysconf(_SC_PAGESIZE);
++	if (psize == -1) {
++		perror("sysconf");
++		exit(EXIT_FAILURE);
++	}
++	n = len/psize + 3;
++	*buf = malloc(psize * n);
++	if (!*buf) {
++		perror("malloc");
++		exit(EXIT_FAILURE);
++	}
++	n = (unsigned long) *buf;
++	n += psize * 2;
++	n &= ~(psize - 1);
++	return (void *) (n - sizeof (Ata));
++}
++
++void
++sigio(int signo) 
++{
++	const char dummy = 0;
++	write(queuepipe[1], &dummy, 1);
++}
++
++void
+ aoe(void)
+ {
+ 	Aoehdr *p;
+-	uchar *buf;
+-	int n, sh;
++	char dummy;
++	int n, place, sh;
+ 	enum { bufsz = 1<<16, };
+-
+-	buf = malloc(bufsz);
++	sigset_t mask, oldmask;
++	struct sigaction sigact;
++	struct pollfd pollfds[2];
++	void *freeme[Nplaces];
++
++	for (n = 0; n < Nplaces; n++) {
++		pkt[n] = bufalloc(freeme + n, bufsz);
++		pending[n] = 0;
++	}
+ 	aoead(sfd);
+ 
++	pipe(queuepipe);
++	fcntl(queuepipe[0], F_SETFL, O_NONBLOCK);
++	fcntl(queuepipe[1], F_SETFL, O_NONBLOCK);
++
++	sigemptyset(&sigact.sa_mask);
++	sigact.sa_flags = 0;
++	sigact.sa_sigaction = (void *) sigio;
++	sigaction(SIGIO, &sigact, NULL);
++
++	sigemptyset(&mask);
++	sigaddset(&mask, SIGIO);
++	sigprocmask(SIG_BLOCK, &mask, &oldmask);
++
++	pollfds[0].fd = queuepipe[0];
++	pollfds[1].fd = sfd;
++	pollfds[0].events = pollfds[1].events = POLLIN;
++
+ 	for (;;) {
+-		n = getpkt(sfd, buf, bufsz);
+-		if (n < 0) {
++		sigprocmask(SIG_SETMASK, &oldmask, NULL);
++		n = poll(pollfds, 2, 1000);
++		sigprocmask(SIG_BLOCK, &mask, NULL);
++
++		if (n < 0 && errno != EINTR) {
++			perror("poll");
++			continue;
++		} else if (n == 0 || pollfds[0].revents & POLLIN) {
++			while(read(queuepipe[0], &dummy, 1) > 0);
++			for (place = 0; place < Nplaces; place++) {
++				if (!pending[place])
++					continue;
++				if (aio_error(aiocb + place) == EINPROGRESS)
++					continue;
++				doaoecomplete(place);
++				pollfds[1].events = POLLIN;
++			}
++		}
++
++		if ((pollfds[1].revents & POLLIN) == 0)
++			continue;
++			
++		for (place = 0; pending[place] && place < Nplaces; place++);
++		if (place >= Nplaces) {
++			pollfds[1].events = 0;
++			continue;
++		}
++
++		pktlen[place] = getpkt(sfd, (uchar *) pkt[place], bufsz);
++		if (pktlen[place] < 0) {
++			if (errno == EINTR)
++				continue;
+ 			perror("read network");
+ 			exit(1);
+ 		}
+-		if (n < sizeof(Aoehdr))
++		if (pktlen[place] < sizeof(Aoehdr))
+ 			continue;
+-		p = (Aoehdr *) buf;
++		p = (Aoehdr *) pkt[place];
+ 		if (ntohs(p->type) != 0x88a2)
+ 			continue;
+ 		if (p->flags & Resp)
+@@ -223,9 +358,10 @@ aoe(void)
+ 			continue;
+ 		if (nmasks && !maskok(p->src))
+ 			continue;
+-		doaoe(p, n);
++		doaoe(place);
+ 	}
+-	free(buf);
++	for (place = 0; place < Nplaces; place++)
++		free(freeme[place]);
+ }
+ 
+ void
+@@ -317,7 +453,7 @@ main(int argc, char **argv)
+ 	}
+ 	if (s.st_mode & (S_IWUSR|S_IWGRP|S_IWOTH))
+ 		omode = O_RDWR;
+-	bfd = open(argv[3], omode);
++	bfd = opendisk(argv[3], omode);
+ 	if (bfd == -1) {
+ 		perror("open");
+ 		exit(1);
+diff -uprN vblade-15/ata.c vblade-15-aio/ata.c
+--- vblade-15/ata.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/ata.c	2008-04-19 22:12:32.000000000 +0100
+@@ -3,6 +3,8 @@
+ #include <string.h>
+ #include <stdio.h>
+ #include <sys/types.h>
++#include <errno.h>
++#include <aio.h>
+ #include "dat.h"
+ #include "fns.h"
+ 
+@@ -98,7 +100,7 @@ atainit(void)
+  * check for that.
+  */
+ int
+-atacmd(Ataregs *p, uchar *dp, int ndp, int payload) // do the ata cmd
++atacmd(Ataregs *p, uchar *dp, int ndp, int payload, struct aiocb *aiocb) // do the ata cmd
+ {
+ 	vlong lba;
+ 	ushort *ip;
+@@ -155,14 +157,29 @@ atacmd(Ataregs *p, uchar *dp, int ndp, i
+ 		return 0;
+ 	}
+ 	if (p->cmd == 0x20 || p->cmd == 0x24)
+-		n = getsec(bfd, dp, lba, p->sectors);
++		n = getsec(bfd, dp, lba, p->sectors, aiocb);
+ 	else {
+ 		// packet should be big enough to contain the data
+ 		if (payload < 512 * p->sectors)
+ 			return -1;
+-		n = putsec(bfd, dp, lba, p->sectors);
++		n = putsec(bfd, dp, lba, p->sectors, aiocb);
+ 	}
+-	n /= 512;
++	if (n < 0) {
++		p->err = ABRT;
++		p->status = ERR|DRDY;
++		p->lba += n;
++		p->sectors -= n;
++		return 0;
++	}
++	return 1; // callback expected
++}
++
++
++int
++atacmdcomplete(Ataregs *p, struct aiocb *aiocb) // complete the ata cmd
++{
++	int n;
++	n = aio_return(aiocb) / 512;
+ 	if (n != p->sectors) {
+ 		p->err = ABRT;
+ 		p->status = ERR;
+@@ -173,4 +190,3 @@ atacmd(Ataregs *p, uchar *dp, int ndp, i
+ 	p->sectors -= n;
+ 	return 0;
+ }
+-
+diff -uprN vblade-15/dat.h vblade-15-aio/dat.h
+--- vblade-15/dat.h	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/dat.h	2008-04-19 16:34:35.000000000 +0100
+@@ -111,6 +111,8 @@ enum {
+ 	Nconfig = 1024,
+ 
+ 	Bufcount = 16,
++
++	Nplaces = 32,
+ };
+ 
+ int	shelf, slot;
+diff -uprN vblade-15/fns.h vblade-15-aio/fns.h
+--- vblade-15/fns.h	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/fns.h	2008-04-17 00:11:36.000000000 +0100
+@@ -15,14 +15,16 @@ int	maskok(uchar *);
+ // ata.c
+ 
+ void	atainit(void);
+-int	atacmd(Ataregs *, uchar *, int, int);
++int	atacmd(Ataregs *, uchar *, int, int, struct aiocb *);
++int	atacmdcomplete(Ataregs *, struct aiocb *);
+ 
+ // os specific
+ 
+ int	dial(char *);
+ int	getea(int, char *, uchar *);
+-int	putsec(int, uchar *, vlong, int);
+-int	getsec(int, uchar *, vlong, int);
++int	opendisk(const char *, int);
++int	putsec(int, uchar *, vlong, int, struct aiocb *);
++int	getsec(int, uchar *, vlong, int, struct aiocb *);
+ int	putpkt(int, uchar *, int);
+ int	getpkt(int, uchar *, int);
+ vlong	getsize(int);
+diff -uprN vblade-15/freebsd.c vblade-15-aio/freebsd.c
+--- vblade-15/freebsd.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/freebsd.c	2008-04-19 22:24:57.000000000 +0100
+@@ -241,19 +241,40 @@ getea(int s, char *eth, uchar *ea)
+ 	return(0);
+ }
+ 
+-
+ int
+-getsec(int fd, uchar *place, vlong lba, int nsec)
++opendisk(const char *disk, int omode)
+ {
+-	return pread(fd, place, nsec * 512, lba * 512);
++	return open(disk, omode);
+ }
+ 
+ int
+-putsec(int fd, uchar *place, vlong lba, int nsec)
+-{
+-	return pwrite(fd, place, nsec * 512, lba * 512);
++getsec(int fd, uchar *place, vlong lba, int nsec, struct aiocb *aiocb)
++{       
++        bzero((char *) aiocb, sizeof(struct aiocb));
++        aiocb->aio_fildes = fd;
++        aiocb->aio_buf = place;
++        aiocb->aio_nbytes = nsec * 512;
++        aiocb->aio_offset = lba * 512;
++        aiocb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
++        aiocb->aio_sigevent.sigev_signo = SIGIO;
++        aiocb->aio_sigevent.sigev_value.sival_ptr = aiocb;
++        return aio_read(aiocb);
+ }
+ 
++int
++putsec(int fd, uchar *place, vlong lba, int nsec, struct aiocb *aiocb)
++{       
++        bzero((char *) aiocb, sizeof(struct aiocb));
++        aiocb->aio_fildes = fd;
++        aiocb->aio_buf = place;
++        aiocb->aio_nbytes = nsec * 512;
++        aiocb->aio_offset = lba * 512;
++        aiocb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
++        aiocb->aio_sigevent.sigev_signo = SIGIO;
++        aiocb->aio_sigevent.sigev_value.sival_ptr = aiocb;
++        return aio_write(aiocb);
++}       
++
+ static int pktn = 0;
+ static uchar *pktbp = NULL;
+ 
+diff -uprN vblade-15/linux.c vblade-15-aio/linux.c
+--- vblade-15/linux.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/linux.c	2008-04-19 22:23:51.000000000 +0100
+@@ -1,5 +1,6 @@
+ // linux.c: low level access routines for Linux
+ #include "config.h"
++#define _GNU_SOURCE
+ #include <sys/socket.h>
+ #include <stdio.h>
+ #include <string.h>
+@@ -22,6 +23,9 @@
+ #include <netinet/in.h>
+ #include <linux/fs.h>
+ #include <sys/stat.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <aio.h>
+ 
+ #include "dat.h"
+ #include "fns.h"
+@@ -29,8 +33,6 @@
+ int	getindx(int, char *);
+ int	getea(int, char *, uchar *);
+ 
+-
+-
+ int
+ dial(char *eth)		// get us a raw connection to an interface
+ {
+@@ -76,7 +78,7 @@ getea(int s, char *name, uchar *ea)
+ 	struct ifreq xx;
+ 	int n;
+ 
+-        strcpy(xx.ifr_name, name);
++	strcpy(xx.ifr_name, name);
+ 	n = ioctl(s, SIOCGIFHWADDR, &xx);
+ 	if (n == -1) {
+ 		perror("Can't get hw addr");
+@@ -102,17 +104,37 @@ getmtu(int s, char *name)
+ }
+ 
+ int
+-getsec(int fd, uchar *place, vlong lba, int nsec)
++opendisk(const char *disk, int omode)
++{
++	return open(disk, omode|O_DIRECT);
++}
++
++int
++getsec(int fd, uchar *place, vlong lba, int nsec, struct aiocb *aiocb)
+ {
+-	lseek(fd, lba * 512, 0);
+-	return read(fd, place, nsec * 512);
++	bzero((char *) aiocb, sizeof(struct aiocb));
++	aiocb->aio_fildes = fd;
++	aiocb->aio_buf = place;
++	aiocb->aio_nbytes = nsec * 512;
++	aiocb->aio_offset = lba * 512;
++	aiocb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
++	aiocb->aio_sigevent.sigev_signo = SIGIO;
++	aiocb->aio_sigevent.sigev_value.sival_ptr = aiocb;
++	return aio_read(aiocb);
+ }
+ 
+ int
+-putsec(int fd, uchar *place, vlong lba, int nsec)
++putsec(int fd, uchar *place, vlong lba, int nsec, struct aiocb *aiocb)
+ {
+-	lseek(fd, lba * 512, 0);
+-	return write(fd, place, nsec * 512);
++	bzero((char *) aiocb, sizeof(struct aiocb));
++	aiocb->aio_fildes = fd;
++	aiocb->aio_buf = place;
++	aiocb->aio_nbytes = nsec * 512;
++	aiocb->aio_offset = lba * 512;
++	aiocb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
++	aiocb->aio_sigevent.sigev_signo = SIGIO;
++	aiocb->aio_sigevent.sigev_value.sival_ptr = aiocb;
++	return aio_write(aiocb);
+ }
+ 
+ int
+diff -uprN vblade-15/linux.h vblade-15-aio/linux.h
+--- vblade-15/linux.h	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/linux.h	2008-04-16 23:03:07.000000000 +0100
+@@ -6,6 +6,6 @@ typedef long long vlong;
+ int	dial(char *);
+ int	getindx(int, char *);
+ int	getea(int, char *, uchar *);
+-int	getsec(int, uchar *, vlong, int);
+-int	putsec(int, uchar *, vlong, int);
++int	getsec(int, uchar *, vlong, int, struct aiocb *);
++int	putsec(int, uchar *, vlong, int, struct aiocb *);
+ vlong	getsize(int);
+diff -uprN vblade-15/makefile vblade-15-aio/makefile
+--- vblade-15/makefile	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-aio/makefile	2008-04-16 19:09:46.000000000 +0100
+@@ -13,7 +13,7 @@ CFLAGS += -Wall -g -O2
+ CC = gcc
+ 
+ vblade: $O
+-	${CC} -o vblade $O
++	${CC} -lrt -o vblade $O
+ 
+ aoe.o : aoe.c config.h dat.h fns.h makefile
+ 	${CC} ${CFLAGS} -c $<

+ 11 - 0
contrib/vblade-15-socketfilter.2.README

@@ -0,0 +1,11 @@
+This patch uses the Berkeley Packet Filter (BPF) feature so that the
+kernel only hands packets to a vblade process when they match its
+shelf and slot address.
+
+Without this patch, each vblade process would be woken up only to
+discard an AoE packet destined for a different vblade running on the
+same host.
+
+This patch currently introduces a build dependency on the headers from
+libpcap.  Usually a Linux distribution will include these headers in a
+package with a name something like "libpcap-dev".

+ 139 - 0
contrib/vblade-15-socketfilter.2.diff

@@ -0,0 +1,139 @@
+diff -uNpr vblade-15-orig/freebsd.c vblade-15-lsf/freebsd.c
+--- vblade-15-orig/freebsd.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-lsf/freebsd.c	2008-04-30 10:45:11.000000000 +0100
+@@ -54,40 +54,44 @@ dial(char *eth)
+ 
+ 	/* packet filter for bpf */
+ 	struct bpf_insn bpf_insns[] = {
+-	  /* Load the type into register */
+-	  BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
+-	  /* Does it match AoE Type (0x88a2)? No, goto INVALID */
+-	  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x88a2, 0, 10),
+-	  /* Load the flags into register */
+-	  BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 14),
+-	  /* Check to see if the Resp flag is set */
+-	  BPF_STMT(BPF_ALU+BPF_AND+BPF_K, Resp),
+-	  /* Yes, goto INVALID */
+-	  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 7),
+-	  /* Load the command into register */
+-	  BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 19),
+-	  /* Is this a ATAcmd? No, goto VALID */
+-	  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ATAcmd, 0, 4),
+-	  /* Load the shelf number into register */
+-	  BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 16),
+-	  /* Does it match shelf number? No, goto INVALID */
+-	  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) shelf, 0, 3),
+-	  /* Load the slot number into register */
+-	  BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 18),
+-	  /* Does it match shelf number? No, goto INVALID */
+-	  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) slot, 0, 1),
+-	  /* VALID: return -1 (allow the packet to be read) */
+-	  BPF_STMT(BPF_RET+BPF_K, (u_int)-1),
+-	  /* INVALID: return 0 (ignore the packet) */
+-	  BPF_STMT(BPF_RET+BPF_K, 0),
++		/* Load the type into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
++		/* Does it match AoE Type (0x88a2)? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x88a2, 0, 12),
++		/* Load the flags into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 14),
++		/* Check to see if the Resp flag is set */
++		BPF_STMT(BPF_ALU+BPF_AND+BPF_K, Resp),
++		/* Yes, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 9),
++		/* Load the shelf number into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 16),
++		/* Does it match shelf number? No, goto CHECKBROADCAST */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) shelf, 0, 2),
++		/* Load the slot number into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 18),
++		/* Does it match slot number? Yes, goto VALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) slot, 4, 0),
++		/* CHECKBROADCAST: is (shelf, slot) == (0xffff, 0xff)? */
++		/* Load the shelf number into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 16),
++		/* Is it 0xffff? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) 0xffff, 0, 3),
++		/* Load the slot number into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 18),
++		/* Is it 0xff? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) 0xff, 0, 1),
++		/* VALID: return -1 (allow the packet to be read) */
++		BPF_STMT(BPF_RET+BPF_K, (u_int) -1),
++		/* INVALID: return 0 (ignore the packet) */
++		BPF_STMT(BPF_RET+BPF_K, 0),
+ 	};
+ 
+ 	struct bpf_program bpf_program = {
+-	  sizeof(bpf_insns)/sizeof(struct bpf_insn),
+-	  bpf_insns
++		sizeof(bpf_insns)/sizeof(struct bpf_insn),
++		bpf_insns
+ 	};
+-
+-
++	
+ 	strncpy(device, BPF_DEV, sizeof BPF_DEV);
+ 
+ 	/* find a bpf device we can use, check /dev/bpf[0-9] */
+diff -uNpr vblade-15-orig/linux.c vblade-15-lsf/linux.c
+--- vblade-15-orig/linux.c	2008-03-07 20:22:16.000000000 +0000
++++ vblade-15-lsf/linux.c	2008-04-30 10:46:04.000000000 +0100
+@@ -22,6 +22,7 @@
+ #include <netinet/in.h>
+ #include <linux/fs.h>
+ #include <sys/stat.h>
++#include <pcap-bpf.h>
+ 
+ #include "dat.h"
+ #include "fns.h"
+@@ -54,6 +55,49 @@ dial(char *eth)		// get us a raw connect
+ 		perror("bind funky");
+ 		return -1;
+ 	}
++
++	/* bpf packet filter for socket */
++	struct bpf_insn bpf_insns[] = {
++		/* Load the type into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
++		/* Does it match AoE Type (0x88a2)? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x88a2, 0, 12),
++		/* Load the flags into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 14),
++		/* Check to see if the Resp flag is set */
++		BPF_STMT(BPF_ALU+BPF_AND+BPF_K, Resp),
++		/* Yes, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 9),
++		/* Load the shelf number into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 16),
++		/* Does it match shelf number? No, goto CHECKBROADCAST */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) shelf, 0, 2),
++		/* Load the slot number into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 18),
++		/* Does it match slot number? Yes, goto VALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) slot, 4, 0),
++		/* CHECKBROADCAST: is (shelf, slot) == (0xffff, 0xff)? */
++		/* Load the shelf number into register */
++		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 16),
++		/* Is it 0xffff? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) 0xffff, 0, 3),
++		/* Load the slot number into register */
++		BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 18),
++		/* Is it 0xff? No, goto INVALID */
++		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (u_int) 0xff, 0, 1),
++		/* VALID: return -1 (allow the packet to be read) */
++		BPF_STMT(BPF_RET+BPF_K, (u_int) -1),
++		/* INVALID: return 0 (ignore the packet) */
++		BPF_STMT(BPF_RET+BPF_K, 0),
++	};
++
++	struct bpf_program bpf_program = {
++		sizeof(bpf_insns)/sizeof(struct bpf_insn),
++		bpf_insns
++	};
++
++	setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_program, sizeof(bpf_program));
++
+ 	return s;
+ }
+ 

+ 1 - 1
dat.h

@@ -6,7 +6,7 @@
  */
 
 enum {
-	VBLADE_VERSION		= 12,
+	VBLADE_VERSION		= 16,
 
 	// Firmware version
 	FWV			= 0x4000 + VBLADE_VERSION,

+ 1 - 1
fns.h

@@ -15,7 +15,7 @@ int	maskok(uchar *);
 // ata.c
 
 void	atainit(void);
-int	atacmd(Ataregs *, uchar *, int);
+int	atacmd(Ataregs *, uchar *, int, int);
 
 // os specific