diff --git a/example/virtiofs/main.go b/example/virtiofs/main.go new file mode 100644 index 000000000..e61fe134c --- /dev/null +++ b/example/virtiofs/main.go @@ -0,0 +1,51 @@ +// Copyright 2024 the Go-FUSE Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "flag" + "log" + "net" + + "github.com/hanwen/go-fuse/v2/fs" + "github.com/hanwen/go-fuse/v2/fuse" + "github.com/hanwen/go-fuse/v2/vhostuser" +) + +func main() { + log.SetFlags(log.Lmicroseconds) + flag.Parse() + + sockpath := flag.Arg(0) + orig := flag.Arg(1) + l, err := net.ListenUnix("unix", &net.UnixAddr{sockpath, "unix"}) + if err != nil { + log.Fatal("Listen", err) + } + + root, err := fs.NewLoopbackRoot(orig) + if err != nil { + log.Fatal(err) + } + opts := &fs.Options{} + opts.Debug = true + opts.Logger = log.Default() + opts.MountOptions.Logger = opts.Logger + rawFS := fs.NewNodeFS(root, opts) + ps := fuse.NewProtocolServer(rawFS, &opts.MountOptions) + + for { + conn, err := l.AcceptUnix() + if err != nil { + break + } + + dev := vhostuser.NewFSDevice(ps) + srv := vhostuser.NewServer(conn, dev) + if err := srv.Serve(); err != nil { + log.Printf("Serve: %v %T", err, err) + } + } +} diff --git a/fuse/protocol-server_test.go b/fuse/protocol-server_test.go new file mode 100644 index 000000000..4e964f211 --- /dev/null +++ b/fuse/protocol-server_test.go @@ -0,0 +1,21 @@ +package fuse + +import ( + "log" + "testing" +) + +func TestProtocolServerParse(t *testing.T) { + in := [][]byte{ + []byte("A\x00\x00\x00\x16\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.\x04\x00\x00\x00\x00\x00\x00"), + []byte("\x00\x00\x00\x00\x00\x00\x00\x00security.selinux\x00"), + } + out := [][]byte{make([]byte, 16), make([]byte, 16)} + + opts := MountOptions{} + opts.Debug = true + opts.Logger = log.Default() + ps := NewProtocolServer(NewDefaultRawFileSystem(), 
&opts) + n, status := ps.HandleRequest(in, out) + log.Println(n, status) +} diff --git a/fuse/request.go b/fuse/request.go index 1bcdd391e..8b219c823 100644 --- a/fuse/request.go +++ b/fuse/request.go @@ -108,7 +108,7 @@ func (r *request) InputDebug() string { if h != nil && h.InType != nil { val = Print(asType(r.inData(), h.InType)) } - + log.Println(val) names := "" if h.FileNames == 1 { names = fmt.Sprintf(" %q", r.filename()) diff --git a/fuse/server.go b/fuse/server.go index b3f858d97..4c7ab95f1 100644 --- a/fuse/server.go +++ b/fuse/server.go @@ -74,17 +74,16 @@ func (ps *ProtocolServer) HandleRequest(in [][]byte, inTogether := make([]byte, len(in[0])+len(in[1])) copy(inTogether, in[0]) copy(inTogether[len(in[0]):], in[1]) - h, inSize, outSize, payloadSize, errno := parseRequest(inTogether, nil) + h, inSize, outSize, outPayloadSize, errno := parseRequest(inTogether, nil) if errno != 0 { return 0, errno } - - log.Printf("h: %v %d %d %d %v", h, inSize, outSize, payloadSize, errno) + log.Printf("%d %d payloadSize %d", inSize, outSize, outPayloadSize) req := request{ cancel: make(chan struct{}), inputBuf: inTogether[:inSize], - outputBuf: make([]byte, outSize+int(sizeOfOutHeader)), - inPayload: make([]byte, payloadSize), + outputBuf: make([]byte, outSize+int(sizeOfOutHeader)+outPayloadSize), + inPayload: inTogether[inSize:], } ps.protocolServer.handleRequest(h, &req) @@ -93,7 +92,7 @@ func (ps *ProtocolServer) HandleRequest(in [][]byte, if len(req.outputBuf) > len(out[0]) { copy(out[1], req.outputBuf[len(out[0]):]) } - if payloadSize > 0 { + if outPayloadSize > 0 { copy(out[len(out)-1], req.outPayload) } return len(req.outPayload) + len(req.outputBuf), 0 diff --git a/vhostuser/fs_test.go b/vhostuser/fs_test.go new file mode 100644 index 000000000..8e112adf4 --- /dev/null +++ b/vhostuser/fs_test.go @@ -0,0 +1,81 @@ +package vhostuser + +import ( + "log" + "net" + "os" + "os/exec" + "testing" + + "github.com/hanwen/go-fuse/v2/fs" + 
"github.com/hanwen/go-fuse/v2/fuse" +) + +func listenVFS(sockpath string, rawFS fuse.RawFileSystem, opts *fuse.MountOptions) { + os.Remove(sockpath) + l, err := net.ListenUnix("unix", &net.UnixAddr{sockpath, "unix"}) + if err != nil { + log.Fatal("Listen", err) + } + for { + conn, err := l.AcceptUnix() + if err != nil { + break + } + + ps := fuse.NewProtocolServer(rawFS, opts) + dev := NewFSDevice(ps) + srv := NewServer(conn, dev) + if err := srv.Serve(); err != nil { + log.Printf("Serve: %v %T", err, err) + } + } +} + +func TestBasic(t *testing.T) { + orig := t.TempDir() + if err := os.WriteFile(orig+"/file.txt", []byte("hello world\n"), 0666); err != nil { + t.Errorf("WriteFile: %v", err) + } + root, err := fs.NewLoopbackRoot(orig) + if err != nil { + t.Fatal(err) + } + opts := &fs.Options{} + opts.Debug = true + opts.Logger = log.Default() + opts.MountOptions.Logger = opts.Logger + rawFS := fs.NewNodeFS(root, opts) + + bindir := os.Getenv("HOME") + "/.cache/go-fuse-virtiofs" + sockpath := "/tmp/vhostqemu" + go listenVFS(sockpath, rawFS, &opts.MountOptions) + + cmd := exec.Command("qemu-system-x86_64", + "-M", "pc", "-m", "4G", "-cpu", "host", "-smp", "2", + "-enable-kvm", + + // to create the communications socket + "-chardev", "socket,id=char0,path="+sockpath, + + // instantiate the device + "-device", "vhost-user-fs-pci,queue-size=1024,chardev=char0,tag=myfs", + + // force use of memory sharable with virtiofsd. 
+ "-object", "memory-backend-file,id=mem,size=4G,mem-path=/dev/shm,share=on", "-numa", "node,memdev=mem", + + "-kernel", bindir+"/bzImage", + "-initrd", bindir+"/initramfs.cpio.gz", + "-nographic", + "-append", + "console=ttyS0", + ) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatal(err) + } +} diff --git a/vhostuser/server.go b/vhostuser/server.go new file mode 100644 index 000000000..f0fffb192 --- /dev/null +++ b/vhostuser/server.go @@ -0,0 +1,896 @@ +package vhostuser + +import ( + "fmt" + "log" + "net" + "reflect" + "sort" + "syscall" + "unsafe" + + "github.com/hanwen/go-fuse/v2/fuse" + "golang.org/x/sys/unix" +) + +type DeviceRegion struct { + VhostUserMemoryRegion + + // MmapAddr uint64 + Data []byte +} + +func (r *DeviceRegion) String() string { + return r.VhostUserMemoryRegion.String() +} + +func (r *DeviceRegion) containsGuestAddr(guestAddr uint64) bool { + return guestAddr >= r.GuestPhysAddr && guestAddr < r.GuestPhysAddr+r.MemorySize +} + +func (r *DeviceRegion) FromDriverAddr(driverAddr uint64) unsafe.Pointer { + if driverAddr < r.VhostUserMemoryRegion.DriverAddr || driverAddr >= r.DriverAddr+r.MemorySize { + return nil + } + + return unsafe.Pointer(&r.Data[driverAddr-r.DriverAddr+r.MmapOffset]) +} + +type FSDevice struct { + reqFD int + + // vring is the same as virtq? 
+ vqs []Virtq + + // sorted by GuestPhysAddr + regions []DeviceRegion + LogTable []byte + + handle func(*Virtq, *VirtqElem) + + fusePS *fuse.ProtocolServer +} + +func NewFSDevice(ps *fuse.ProtocolServer) *FSDevice { + d := &FSDevice{ + vqs: make([]Virtq, 2), + fusePS: ps, + } + for i := range d.vqs { + d.vqs[i].Notification = true + } + + d.handle = func(vq *Virtq, e *VirtqElem) { + n, errno := ps.HandleRequest(e.read, e.write) + if errno != 0 { + log.Printf("%v", errno) + } + + d.pushQueue(vq, e, n) + d.queueNotify(vq) + } + return d +} + +type Ring struct { + Num int + Desc []VringDesc + Avail *VringAvail + AvailRing []uint16 + AvailUsedEvent *uint16 + Used *VringUsed + UsedRing []VringUsedElement + UsedAvailEvent *uint16 + + LogGuestAddr []byte + Flags uint32 +} + +type VirtqInflight struct { + Features uint64 + Version uint16 + DescNum uint16 + LastBatchHead uint16 + UsedIdx uint16 + + Desc0 DescStateSplit // array. +} + +type DescStateSplit struct { + inflight uint8 + padding [5]uint8 + next uint16 + counter uint64 +} + +type InflightDesc struct { + index uint16 + counter uint64 +} + +type Virtq struct { + Vring Ring + + Inflight *VirtqInflight + InflightDescs []DescStateSplit + ResubmitList *InflightDesc + + ResubmitNum uint16 + + Counter uint64 + LastAvailIdx uint16 + + ShadowAvailIdx uint16 + + UsedIdx uint16 + SignaledUsed uint16 + + SignaledUsedValid bool + Notification bool + + inuse uint + + handler func(*FSDevice, int) + + CallFD int + KickFD int + ErrFD int + Enable uint + Started bool + + Addr VhostVringAddr +} + +func (vq *Virtq) availIdx() uint16 { + // Weird, sideeffect? 
+ vq.ShadowAvailIdx = vq.Vring.Avail.Idx + return vq.ShadowAvailIdx +} + +func (vq *Virtq) queueEmpty() bool { + // dev.broken + // vq.vring == nil + + if vq.ShadowAvailIdx != vq.LastAvailIdx { + return false + } + return vq.availIdx() == vq.LastAvailIdx +} + +func (d *FSDevice) MapRing(vq *Virtq) error { + if d := d.FromDriverAddr(vq.Addr.DescUserAddr); d == nil { + return fmt.Errorf("could not map DescUserAddr %x", vq.Addr.DescUserAddr) + } else { + vq.Vring.Desc = unsafe.Slice((*VringDesc)(d), vq.Vring.Num) + } + if d := d.FromDriverAddr(vq.Addr.UsedUserAddr); d == nil { + return fmt.Errorf("could not map UsedUserAddr %x", + vq.Addr.UsedUserAddr) + } else { + vq.Vring.Used = (*VringUsed)(d) + vq.Vring.UsedRing = unsafe.Slice(&vq.Vring.Used.Ring0, vq.Vring.Num) + //if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vq.Vring.UsedAvailEvent = (*uint16)(unsafe.Pointer(&unsafe.Slice(&vq.Vring.Used.Ring0, vq.Vring.Num+1)[vq.Vring.Num])) + + } + + if d := d.FromDriverAddr(vq.Addr.AvailUserAddr); d == nil { + return fmt.Errorf("could not map AvailUserAddr %x", + vq.Addr.AvailUserAddr) + } else { + vq.Vring.Avail = (*VringAvail)(d) + vq.Vring.AvailRing = unsafe.Slice(&vq.Vring.Avail.Ring0, vq.Vring.Num) + //if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vq.Vring.AvailUsedEvent = &unsafe.Slice(&vq.Vring.Avail.Ring0, vq.Vring.Num+1)[vq.Vring.Num] + } + return nil +} + +func (d *FSDevice) FromDriverAddr(driverAddr uint64) unsafe.Pointer { + for _, r := range d.regions { + d := r.FromDriverAddr(driverAddr) + if d != nil { + return d + } + } + return nil +} + +func (d *FSDevice) FromGuestAddr(guestAddr uint64, sz uint64) []byte { + idx := d.findRegionByGuestAddr(guestAddr) + r := d.regions[idx] + if !r.containsGuestAddr(guestAddr) { + return nil + } + + seg := r.Data[guestAddr-r.GuestPhysAddr:] + if len(seg) > int(sz) { + seg = seg[:sz] + } + return seg +} + +// https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#communication +// is incorrect regarding 
types. +func (d *FSDevice) SetLogBase(fd int, log *VhostUserLog) error { + data, err := syscall.Mmap(fd, int64(log.MmapOffset), int(log.MmapSize), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED) // |syscall.MAP_NORESERVE)? + syscall.Close(fd) + if err != nil { + return err + } + if d.LogTable != nil { + syscall.Munmap(d.LogTable) + } + + d.LogTable = data + return nil +} + +func (d *FSDevice) SetVringAddr(addr *VhostVringAddr) error { + vq := &d.vqs[addr.Index] + vq.Addr = *addr + vq.Vring.Flags = uint32(addr.Flags) // bitsize? + + logAddr := unsafe.Pointer(uintptr(addr.LogGuestAddr)) + vq.Vring.LogGuestAddr = unsafe.Slice((*byte)(logAddr), 0) // + + if err := d.MapRing(vq); err != nil { + return err + } + + vq.UsedIdx = vq.Vring.Used.Idx // LE16toH + if vq.LastAvailIdx != vq.UsedIdx { + resume := true // device->processed_in_order() + if resume { + vq.ShadowAvailIdx = vq.UsedIdx + vq.LastAvailIdx = vq.UsedIdx + } + } + + return nil +} + +func (d *FSDevice) SetVringNum(state *VhostVringState) { + d.vqs[state.Index].Vring.Num = int(state.Num) +} + +func (d *FSDevice) SetVringBase(state *VhostVringState) { + p := &d.vqs[state.Index] + p.ShadowAvailIdx = uint16(state.Num) + p.LastAvailIdx = uint16(state.Num) +} +func (d *FSDevice) SetVringEnable(state *VhostVringState) { + p := &d.vqs[state.Index] + p.Enable = uint(state.Num) + d.kickMe(state.Index) +} + +func clearSlice(s []byte) { + for i := range s { + s[i] = 0 + } +} + +func (d *FSDevice) kickMe(idx uint32) { + vq := &d.vqs[idx] + + go func() { + for { + var id [8]byte + _, err := syscall.Read(vq.KickFD, id[:]) + data, err := d.popQueue(vq) + if err != nil { + log.Printf("popq: %v", err) + continue + } + if data == nil { + log.Printf("queue was empty") + continue + } + for i, e := range data.read { + log.Printf("read %d: %q (%d)", i, e, len(e)) + } + outlens := []int{} + for _, e := range data.write { + clearSlice(e) + outlens = append(outlens, len(e)) + } + log.Printf("id %d: write space: %v", 
data.index, outlens) + + // should pass on vq as well? + if d.handle != nil { + d.handle(vq, data) + for i, e := range data.write { + log.Printf("write %d: %q (%d)", i, e, len(e)) + } + } else { + log.Printf("no handler defined") + } + } + }() +} + +type VirtqElem struct { + // this is the index into Vring.Desc + index uint + + // read and write from our perspective. The write field is for + // consumers (ie the file system). We return the total length + // to the driver, which can find the memory through the vring + // index above. + write [][]byte + read [][]byte +} + +func (d *FSDevice) dumpRegions() { + for i, r := range d.regions { + log.Printf("region %d: %v", i, &r) + } +} + +func (d *FSDevice) popQueue(vq *Virtq) (*VirtqElem, error) { + + /* TODO: unlikely conditions */ + + // dev->broken? + // vq.vring.avail == 0 + if vq.ResubmitList != nil && vq.ResubmitNum > 0 { + return nil, fmt.Errorf("resubmit") + } + + if vq.queueEmpty() { + return nil, nil + } + + if int(vq.inuse) >= vq.Vring.Num { + return nil, fmt.Errorf("virtq size exceeded") + } + + // todo RMB read barrier. 
+ + idx := int(vq.LastAvailIdx) % vq.Vring.Num + + vq.LastAvailIdx++ + head := vq.Vring.AvailRing[idx] + if int(head) > vq.Vring.Num { + log.Panicf("silly avail %d %d", head, vq.Vring.Num) + } + if vq.Vring.UsedAvailEvent != nil { + *vq.Vring.UsedAvailEvent = vq.LastAvailIdx + } + + // vu_queue_map_desc + elem, err := d.queueMapDesc(vq, int(head)) + if elem == nil || err != nil { + return nil, err + } + vq.inuse++ + d.queueInflightGet(vq, int(head)) + return elem, nil +} + +func (d *FSDevice) logQueueFill(vq *Virtq, elem *VirtqElem, len int) { + // NOP, need LOG_SHMFD features +} + +func (d *FSDevice) pushQueue(vq *Virtq, elem *VirtqElem, len int) { + // vu_queue_fill + // > vu_log_queue_fill // log_write for UsedRing write + + // vu_queue_fill l.3103 + idx := int(vq.UsedIdx) % vq.Vring.Num + ue := VringUsedElement{ + ID: uint32(elem.index), + Len: uint32(len), + } + + log.Printf("putting used elem %v as idx %d", &ue, idx) + vq.Vring.UsedRing[idx] = ue + // > vring_used_write + // > vu_queue_inflight_pre_put(dev, vq, elem->index); + // only for VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + + // vu_queue_flush + // wmb barrier + + old := vq.UsedIdx + new := uint16(old + 1) // why not % num? + vq.UsedIdx = new + vq.Vring.Used.Idx = new + // log write + + vq.inuse-- + + // ? does this have something to do with u16 wrapping? + if new-vq.SignaledUsed < new-old { + vq.SignaledUsedValid = false + } + + //vu_queue_inflight_post_put(dev, vq, elem->index); + // only for VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD +} + +// virtio-ring.h +func VringNeedEvent(eventIdx uint16, newIdx, old uint16) bool { + return newIdx-eventIdx-1 < newIdx-old +} + +func (d *FSDevice) vringNotify(vq *Virtq) bool { + // mem barrier + + // if F_NOTIFY_ON_EMPTY ... + + // if ! F_EVENT_IDX ... 
+ + v := vq.SignaledUsedValid + old := vq.SignaledUsed + new := vq.UsedIdx + vq.SignaledUsed = new + vq.SignaledUsedValid = true + return !v || VringNeedEvent(*vq.Vring.AvailUsedEvent, new, old) +} + +func (d *FSDevice) queueNotify(vq *Virtq) { + if !d.vringNotify(vq) { + log.Printf("queueNotify: skipped") + return + } + + // if INBAND_NOTIFICATIONS ... + var payload [8]byte + payload[0] = 1 + if _, err := syscall.Write(vq.CallFD, payload[:]); err != nil { + log.Panicf("eventfd write: %v", err) + } +} + +// set bit in dev.LogTable bitvector . the bitvector indexes 4k pages +// this lets the guest know there was a write in the page. Needs +// LOG_SHMFD feature. +func (d *FSDevice) logWrite(address, sz uint64) { + if d.LogTable == nil || sz == 0 { + return + } + + // if !F_LOG_ALL return + // mark addr in the d.LogTable bitvector. + // kick the log fd. +} + +func (d *FSDevice) queueInflightGet(vq *Virtq, head int) { + // VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + if vq.Inflight == nil { + // always returns here + return + } + vq.InflightDescs[head].counter = vq.Counter + vq.Counter++ + vq.InflightDescs[head].inflight = 1 +} + +func (d *FSDevice) queueMapDesc(vq *Virtq, head int) (*VirtqElem, error) { + result := VirtqElem{ + index: uint(head), + } + + descArray := vq.Vring.Desc + desc := descArray[head] + if desc.Flags&VRING_DESC_F_INDIRECT != 0 { + eltSize := unsafe.Sizeof(VringDesc{}) + if (desc.Len % uint32(eltSize)) != 0 { + return nil, fmt.Errorf("modulo size") + } + + indirectAsBytes := d.FromGuestAddr(desc.Addr, uint64(desc.Len)) + if indirectAsBytes == nil { + return nil, fmt.Errorf("OOB read %x %#v", desc.Addr, d.regions) + } + if len(indirectAsBytes) != int(desc.Len) { + return nil, fmt.Errorf("partial read indirect desc") + } + n := desc.Len / uint32(eltSize) + descArray = unsafe.Slice((*VringDesc)(unsafe.Pointer(&indirectAsBytes[0])), n) + desc = descArray[0] + } + + for { + iov := d.readVringEntry(desc.Addr, desc.Len) + if desc.Flags&VRING_DESC_F_WRITE != 0 
{ + // virtqueue_map_desc + result.write = append(result.write, iov...) + } else { + result.read = append(result.read, iov...) + } + // + + if desc.Flags&VRING_DESC_F_NEXT == 0 { + break + } + + head = int(desc.Next) + // barrier + + // todo: check max + + desc = descArray[head] + } + + return &result, nil +} + +// take VIRTQUEUE_MAX_SIZE ? +func (d *FSDevice) readVringEntry(physAddr uint64, sz uint32) [][]byte { + var result [][]byte + + for sz > 0 { + d := d.FromGuestAddr(physAddr, uint64(sz)) + result = append(result, d) + sz -= uint32(len(d)) + physAddr += uint64(len(d)) + } + + return result +} + +func (d *FSDevice) findRegionByGuestAddr(guestAddr uint64) int { + return sort.Search(len(d.regions), + func(i int) bool { + return guestAddr < d.regions[i].GuestPhysAddr+d.regions[i].MemorySize + }) +} + +func (d *FSDevice) AddMemReg(fd int, reg *VhostUserMemoryRegion) error { + if len(d.regions) == int(d.GetMaxMemslots()) { + return fmt.Errorf("hot add memory") + } + + idx := d.findRegionByGuestAddr(reg.GuestPhysAddr) + if hps := GetFDHugepagesize(fd); hps != 0 { + return fmt.Errorf("huge pages") + } + + data, err := syscall.Mmap(fd, int64(reg.MmapOffset), int(reg.MemorySize), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED|syscall.MAP_NORESERVE) + if err != nil { + return err + } + syscall.Madvise(data, unix.MADV_DONTDUMP) + + d.regions = append(d.regions, DeviceRegion{}) + copy(d.regions[idx+1:], d.regions[idx:]) + d.regions[idx] = DeviceRegion{ + VhostUserMemoryRegion: VhostUserMemoryRegion{ + GuestPhysAddr: reg.GuestPhysAddr, + MemorySize: reg.MemorySize, + DriverAddr: reg.DriverAddr, + MmapOffset: 0, // input holds the offset into the fd. 
+ }, + Data: data, + } + return nil +} + +func (d *FSDevice) SetVringKick(fd int, index uint64) error { + if index&(1<<8) != 0 { + log.Panic("not supported") + } + old := d.vqs[index].KickFD + if old != 0 { + syscall.Close(old) + } + d.vqs[index].KickFD = fd + + return syscall.SetNonblock(fd, false) +} + +// todo consolidate +func (d *FSDevice) SetVringErr(fd int, index uint64) { + if index&(1<<8) != 0 { + log.Panic("not supported") + } + + if old := d.vqs[index].ErrFD; old != 0 { + syscall.Close(old) + } + + d.vqs[index].ErrFD = fd +} + +func (d *FSDevice) SetVringCall(fd int, index uint64) { + if index&(1<<8) != 0 { + log.Panic("not supported") + } + if old := d.vqs[index].CallFD; old != 0 { + syscall.Close(old) + } + d.vqs[index].CallFD = fd +} + +func (d *FSDevice) SetOwner() { + +} + +func (d *FSDevice) SetReqFD(fd int) { + d.reqFD = fd +} + +const MAX_MEM_SLOTS = 509 + +func (d *FSDevice) GetMaxMemslots() uint64 { + return MAX_MEM_SLOTS +} + +func (d *FSDevice) GetQueueNum() uint64 { + return uint64(len(d.vqs)) +} + +func (h *FSDevice) GetFeatures() []int { + return []int{ + //"\0\0\0p\1\0\0\0" + RING_F_INDIRECT_DESC, + RING_F_EVENT_IDX, + F_PROTOCOL_FEATURES, + F_VERSION_1, + } +} + +func (h *FSDevice) SetFeatures(fs []int) { + +} + +func (h *FSDevice) SetProtocolFeatures([]int) { + +} + +// not supporting VHOST_USER_PROTOCOL_F_PAGEFAULT, so no support for +// postcopy listening. 
+func (h *FSDevice) GetProtocolFeatures() []int { + // ")\204\0\0\0\0\0\0" + // x29 x84 + return []int{ + PROTOCOL_F_MQ, + PROTOCOL_F_REPLY_ACK, + PROTOCOL_F_BACKEND_REQ, + PROTOCOL_F_BACKEND_SEND_FD, + PROTOCOL_F_CONFIGURE_MEM_SLOTS, + } +} + +type Server struct { + conn *net.UnixConn + device *FSDevice +} + +type empty struct{} + +func NewServer(c *net.UnixConn, d *FSDevice) *Server { + return &Server{conn: c, device: d} +} + +func (s *Server) Serve() error { + for { + if err := s.oneRequest(); err != nil { + return err + } + } +} +func composeMask(fs []int) uint64 { + var mask uint64 + for _, f := range fs { + mask |= (uint64(0x1) << f) + } + return mask +} + +func (s *Server) getProtocolFeatures(rep *GetProtocolFeaturesReply) { + rep.Mask = composeMask(s.device.GetProtocolFeatures()) +} +func (s *Server) setProtocolFeatures(rep *SetProtocolFeaturesRequest) { +} + +func (s *Server) getFeatures(rep *GetFeaturesReply) { + rep.Mask = composeMask(s.device.GetFeatures()) +} + +func (s *Server) setFeatures(rep *SetFeaturesRequest) { +} + +const hdrSize = int(unsafe.Sizeof(Header{})) + +func (s *Server) oneRequest() error { + var inBuf, oobBuf, outBuf [4096]byte + + // _ = flags is usually CLOEXEC. + bufN, oobN, _, _, err := s.conn.ReadMsgUnix(inBuf[:hdrSize], oobBuf[:]) + oob := oobBuf[:oobN] + if err != nil { + return err + } + + inHeader := (*Header)(unsafe.Pointer(&inBuf[0])) + reqName := (reqNames[int(inHeader.Request)]) + + var inFDs []int + if len(oob) > 0 { + scms, err := syscall.ParseSocketControlMessage(oob) + if err != nil { + return err + } + for _, scm := range scms { + fds, err := syscall.ParseUnixRights(&scm) + if err != nil { + return err + } + inFDs = append(inFDs, fds...) + + // TODO make sockets non-blocking? 
See util/vhost-user-server.c l.179 + } + } + + if inHeader.Size > 0 { + bufN2, oobN2, flags2, addr2, err := s.conn.ReadMsgUnix(inBuf[hdrSize:hdrSize+int(inHeader.Size)], oobBuf[oobN:]) + if err != nil { + return err + } + if bufN2 < int(inHeader.Size) { + return fmt.Errorf("short read got %d want %d", bufN2, inHeader.Size) + } + oobN += oobN2 + bufN += bufN2 + + if oobN2 > 0 { + log.Printf("oob2 %q flags2 %x addr2 %x", oobBuf[oobN:oobN2+oobN], flags2, addr2) + } + } + + inPayload := unsafe.Pointer(&inBuf[hdrSize]) + inDebug := "" + if f := decodeIn[inHeader.Request]; f != nil { + // TODO - check payload size + inDebug = fmt.Sprintf("%v", f(inPayload)) + } else if inHeader.Size > 0 { + inDebug = fmt.Sprintf("payload %q (%d bytes)", inBuf[hdrSize:hdrSize+int(inHeader.Size)], inHeader.Size) + } + + needReply := (inHeader.Flags & (0x1 << 3)) != 0 + flagStr := "" + if needReply { + flagStr = "need_reply " + } + log.Printf("rx %-2d %s %s %sFDs %v", inHeader.Request, reqName, inDebug, flagStr, inFDs) + + if c := inFDCount[inHeader.Request]; c != len(inFDs) { + return fmt.Errorf("got %d fds for %s, want %d", len(inFDs), reqName, c) + } + + var outHeader = (*Header)(unsafe.Pointer(&outBuf[0])) + outPayloadPtr := unsafe.Pointer(&outBuf[hdrSize]) + inPayloadPtr := unsafe.Pointer(&inBuf[hdrSize]) + *outHeader = *inHeader + outHeader.Flags |= 0x4 // reply + + var rep interface{} + var deviceErr error + switch inHeader.Request { + case REQ_GET_FEATURES: + r := (*GetFeaturesReply)(outPayloadPtr) + s.getFeatures(r) + rep = r + case REQ_SET_FEATURES: + req := (*SetFeaturesRequest)(inPayloadPtr) + s.setFeatures(req) + case REQ_GET_PROTOCOL_FEATURES: + r := (*GetProtocolFeaturesReply)(outPayloadPtr) + s.getProtocolFeatures(r) + rep = r + case REQ_SET_PROTOCOL_FEATURES: + req := (*SetProtocolFeaturesRequest)(inPayloadPtr) + s.setProtocolFeatures(req) + + case REQ_GET_QUEUE_NUM: + r := (*U64Payload)(outPayloadPtr) + r.Num = s.device.GetQueueNum() + rep = r + case REQ_GET_MAX_MEM_SLOTS: 
+ r := (*U64Payload)(outPayloadPtr) + r.Num = s.device.GetMaxMemslots() + rep = r + case REQ_SET_BACKEND_REQ_FD: + s.device.SetReqFD(inFDs[0]) + case REQ_SET_OWNER: + // should pass in addr or something? + s.device.SetOwner() + case REQ_SET_VRING_CALL: + req := (*U64Payload)(inPayloadPtr) + s.device.SetVringCall(inFDs[0], req.Num) + case REQ_SET_VRING_ERR: + req := (*U64Payload)(inPayloadPtr) + s.device.SetVringErr(inFDs[0], req.Num) + case REQ_SET_VRING_KICK: + req := (*U64Payload)(inPayloadPtr) + deviceErr = s.device.SetVringKick(inFDs[0], req.Num) + case REQ_ADD_MEM_REG: + // req can also be u64 if in postcopy mode (sigh). + req := (*VhostUserMemRegMsg)(inPayloadPtr) + deviceErr = s.device.AddMemReg(inFDs[0], &req.Region) + case REQ_SET_VRING_NUM: + req := (*VhostVringState)(inPayloadPtr) + s.device.SetVringNum(req) + case REQ_SET_VRING_BASE: + req := (*VhostVringState)(inPayloadPtr) + s.device.SetVringBase(req) + case REQ_SET_VRING_ENABLE: + req := (*VhostVringState)(inPayloadPtr) + s.device.SetVringEnable(req) + case REQ_SET_VRING_ADDR: + req := (*VhostVringAddr)(inPayloadPtr) + deviceErr = s.device.SetVringAddr(req) + case REQ_SET_LOG_BASE: + req := (*VhostUserLog)(inPayloadPtr) + s.device.SetLogBase(inFDs[0], req) + default: + log.Printf("unknown operation %d", inHeader.Request) + } + + outPayloadSz := 0 + if needReply && rep == nil { + r := (*U64Payload)(outPayloadPtr) + if deviceErr != nil { + log.Printf("request error: %v", deviceErr) + r.Num = 1 + } else { + r.Num = 0 + } + rep = r + + // qemu doesn't like NEED_REPLY + outHeader.Flags ^= (1 << 3) + } else if deviceErr != nil { + log.Printf("device error: %v", deviceErr) + } + + var repBytes []byte + outDebug := "no reply" + if rep != nil { + outPayloadSz = int(reflect.ValueOf(rep).Elem().Type().Size()) + outHeader.Size = uint32(outPayloadSz) + repBytes = outBuf[:hdrSize+outPayloadSz] + + if s, ok := rep.(fmt.Stringer); ok { + outDebug = s.String() + } else { + outDebug = fmt.Sprintf("payload %q (%d 
bytes)", repBytes[hdrSize:], outPayloadSz) + } + } + + log.Printf("tx %s %s", reqName, outDebug) + + if len(repBytes) > 0 { + if _, err := s.conn.Write(repBytes); err != nil { + log.Printf("%v %T", err, err) + return err + } + } + return nil +} + +const HUGETLBFS_MAGIC = 0x958458f6 + +func GetFDHugepagesize(fd int) int { + var fs syscall.Statfs_t + var err error + for { + err = syscall.Fstatfs(fd, &fs) + if err != syscall.EINTR { + break + } + } + + if err == nil && fs.Type == HUGETLBFS_MAGIC { + return int(fs.Bsize) + } + return 0 +} diff --git a/vhostuser/types.go b/vhostuser/types.go new file mode 100644 index 000000000..5829e4a1f --- /dev/null +++ b/vhostuser/types.go @@ -0,0 +1,544 @@ +package vhostuser + +import ( + "fmt" + "strconv" + "strings" + "unsafe" +) + +// protocol features vhost-user.h +const ( + PROTOCOL_F_MQ = 0 + PROTOCOL_F_LOG_SHMFD = 1 + PROTOCOL_F_RARP = 2 + PROTOCOL_F_REPLY_ACK = 3 + PROTOCOL_F_NET_MTU = 4 + PROTOCOL_F_BACKEND_REQ = 5 + PROTOCOL_F_CROSS_ENDIAN = 6 + PROTOCOL_F_CRYPTO_SESSION = 7 + PROTOCOL_F_PAGEFAULT = 8 + PROTOCOL_F_CONFIG = 9 + PROTOCOL_F_BACKEND_SEND_FD = 10 + PROTOCOL_F_HOST_NOTIFIER = 11 + PROTOCOL_F_INFLIGHT_SHMFD = 12 + PROTOCOL_F_RESET_DEVICE = 13 + PROTOCOL_F_INBAND_NOTIFICATIONS = 14 + PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15 + PROTOCOL_F_STATUS = 16 + /* Feature 17 reserved for PROTOCOL_F_XEN_MMAP. 
*/ + PROTOCOL_F_SHARED_OBJECT = 18 + PROTOCOL_F_DEVICE_STATE = 19 + PROTOCOL_F_MAX = 20 +) + +var protocolFeatureNames = map[int]string{ + PROTOCOL_F_MQ: "MQ", + PROTOCOL_F_LOG_SHMFD: "LOG_SHMFD", + PROTOCOL_F_RARP: "RARP", + PROTOCOL_F_REPLY_ACK: "REPLY_ACK", + PROTOCOL_F_NET_MTU: "NET_MTU", + PROTOCOL_F_BACKEND_REQ: "BACKEND_REQ", + PROTOCOL_F_CROSS_ENDIAN: "CROSS_ENDIAN", + PROTOCOL_F_CRYPTO_SESSION: "CRYPTO_SESSION", + PROTOCOL_F_PAGEFAULT: "PAGEFAULT", + PROTOCOL_F_CONFIG: "CONFIG", + PROTOCOL_F_BACKEND_SEND_FD: "BACKEND_SEND_FD", + PROTOCOL_F_HOST_NOTIFIER: "HOST_NOTIFIER", + PROTOCOL_F_INFLIGHT_SHMFD: "INFLIGHT_SHMFD", + PROTOCOL_F_RESET_DEVICE: "RESET_DEVICE", + PROTOCOL_F_INBAND_NOTIFICATIONS: "INBAND_NOTIFICATIONS", + PROTOCOL_F_CONFIGURE_MEM_SLOTS: "CONFIGURE_MEM_SLOTS", + PROTOCOL_F_STATUS: "STATUS", + /* Feature 17 reserved for PROTOCOL_F_XEN_MMAP. */ + PROTOCOL_F_SHARED_OBJECT: "SHARED_OBJECT", + PROTOCOL_F_DEVICE_STATE: "DEVICE_STATE", + PROTOCOL_F_MAX: "MAX", +} + +// include/standard-headers/linux/virtio_config.h +// include/standard-headers/linux/vhost_types.h +const ( + F_NOTIFY_ON_EMPTY = 24 + F_LOG_ALL = 26 + + F_ANY_LAYOUT = 27 + + // include/standard-headers/linux/virtio_ring.h + // https://stackoverflow.com/questions/46334546/what-is-indirect-buffer-and-indirect-descriptor + RING_F_INDIRECT_DESC = 28 + RING_F_EVENT_IDX = 29 + + F_PROTOCOL_FEATURES = 30 + + F_VERSION_1 = 32 + F_ACCESS_PLATFORM = 33 + F_RING_PACKED = 34 + F_IN_ORDER = 35 + F_ORDER_PLATFORM = 36 + F_SR_IOV = 37 + F_NOTIFICATION_DATA = 38 + F_NOTIF_CONFIG_DATA = 39 + F_RING_RESET = 40 + F_ADMIN_VQ = 41 +) + +var featureNames = map[int]string{ + F_NOTIFY_ON_EMPTY: "NOTIFY_ON_EMPTY", + F_LOG_ALL: "LOG_ALL", + F_ANY_LAYOUT: "ANY_LAYOUT", + RING_F_INDIRECT_DESC: "RING_F_INDIRECT_DESC", + RING_F_EVENT_IDX: "RING_F_EVENT_IDX", + F_PROTOCOL_FEATURES: "PROTOCOL_FEATURES", + + F_VERSION_1: "VERSION_1", + F_ACCESS_PLATFORM: "ACCESS_PLATFORM", + F_RING_PACKED: "RING_PACKED", + F_IN_ORDER: 
// maskToString renders the bits set in mask as a comma-separated list,
// lowest bit first. Each set bit is shown by its entry in names, or by its
// decimal bit number when names has no (or an empty) entry for it. A zero
// mask yields the empty string.
func maskToString(names map[int]string, mask uint64) string {
	var parts []string
	for bit := 0; bit < 64; bit++ {
		if (mask>>uint(bit))&1 == 0 {
			continue
		}
		label, ok := names[bit]
		if !ok || label == "" {
			label = strconv.Itoa(bit)
		}
		parts = append(parts, label)
	}
	return strings.Join(parts, ",")
}
*/ + REQ_GET_MAX_MEM_SLOTS = 36 + REQ_ADD_MEM_REG = 37 + REQ_REM_MEM_REG = 38 + REQ_SET_STATUS = 39 + REQ_GET_STATUS = 40 + REQ_GET_SHARED_OBJECT = 41 + REQ_SET_DEVICE_STATE_FD = 42 + REQ_CHECK_DEVICE_STATE = 43 + REQ_MAX = 44 +) + +var reqNames = map[int]string{ + REQ_NONE: "NONE", + REQ_GET_FEATURES: "GET_FEATURES", + REQ_SET_FEATURES: "SET_FEATURES", + REQ_SET_OWNER: "SET_OWNER", + REQ_RESET_OWNER: "RESET_OWNER", + REQ_SET_MEM_TABLE: "SET_MEM_TABLE", + REQ_SET_LOG_BASE: "SET_LOG_BASE", + REQ_SET_LOG_FD: "SET_LOG_FD", + REQ_SET_VRING_NUM: "SET_VRING_NUM", + REQ_SET_VRING_ADDR: "SET_VRING_ADDR", + REQ_SET_VRING_BASE: "SET_VRING_BASE", + REQ_GET_VRING_BASE: "GET_VRING_BASE", + REQ_SET_VRING_KICK: "SET_VRING_KICK", + REQ_SET_VRING_CALL: "SET_VRING_CALL", + REQ_SET_VRING_ERR: "SET_VRING_ERR", + REQ_GET_PROTOCOL_FEATURES: "GET_PROTOCOL_FEATURES", + REQ_SET_PROTOCOL_FEATURES: "SET_PROTOCOL_FEATURES", + REQ_GET_QUEUE_NUM: "GET_QUEUE_NUM", + REQ_SET_VRING_ENABLE: "SET_VRING_ENABLE", + REQ_SEND_RARP: "SEND_RARP", + REQ_NET_SET_MTU: "NET_SET_MTU", + REQ_SET_BACKEND_REQ_FD: "SET_BACKEND_REQ_FD", + REQ_IOTLB_MSG: "IOTLB_MSG", + REQ_SET_VRING_ENDIAN: "SET_VRING_ENDIAN", + REQ_GET_CONFIG: "GET_CONFIG", + REQ_SET_CONFIG: "SET_CONFIG", + REQ_CREATE_CRYPTO_SESSION: "CREATE_CRYPTO_SESSION", + REQ_CLOSE_CRYPTO_SESSION: "CLOSE_CRYPTO_SESSION", + REQ_POSTCOPY_ADVISE: "POSTCOPY_ADVISE", + REQ_POSTCOPY_LISTEN: "POSTCOPY_LISTEN", + REQ_POSTCOPY_END: "POSTCOPY_END", + REQ_GET_INFLIGHT_FD: "GET_INFLIGHT_FD", + REQ_SET_INFLIGHT_FD: "SET_INFLIGHT_FD", + REQ_GPU_SET_SOCKET: "GPU_SET_SOCKET", + REQ_RESET_DEVICE: "RESET_DEVICE", + REQ_GET_MAX_MEM_SLOTS: "GET_MAX_MEM_SLOTS", + REQ_ADD_MEM_REG: "ADD_MEM_REG", + REQ_REM_MEM_REG: "REM_MEM_REG", + REQ_SET_STATUS: "SET_STATUS", + REQ_GET_STATUS: "GET_STATUS", + REQ_GET_SHARED_OBJECT: "GET_SHARED_OBJECT", + REQ_SET_DEVICE_STATE_FD: "SET_DEVICE_STATE_FD", + REQ_CHECK_DEVICE_STATE: "CHECK_DEVICE_STATE", + REQ_MAX: "MAX", +} + +const ( + 
BACKEND_REQ_NONE = 0 + BACKEND_REQ_IOTLB_MSG = 1 + BACKEND_REQ_CONFIG_CHANGE_MSG = 2 + BACKEND_REQ_VRING_HOST_NOTIFIER_MSG = 3 + BACKEND_REQ_SHARED_OBJECT_ADD = 6 + BACKEND_REQ_SHARED_OBJECT_REMOVE = 7 + BACKEND_REQ_SHARED_OBJECT_LOOKUP = 8 + BACKEND_REQ_MAX = 9 +) + +const ( + VHOST_MEMORY_BASELINE_NREGIONS = 8 + BACKEND_MAX_FDS = 8 + MAX_CONFIG_SIZE = 256 +) + +type GetFeaturesReply struct { + Mask uint64 +} + +var decodeIn = map[uint32]func(unsafe.Pointer) interface{}{ + REQ_ADD_MEM_REG: func(p unsafe.Pointer) interface{} { return (*VhostUserMemRegMsg)(p) }, + REQ_SET_FEATURES: func(p unsafe.Pointer) interface{} { return (*SetFeaturesRequest)(p) }, + REQ_SET_PROTOCOL_FEATURES: func(p unsafe.Pointer) interface{} { return (*SetProtocolFeaturesRequest)(p) }, + REQ_SET_VRING_ADDR: func(p unsafe.Pointer) interface{} { return (*VhostVringAddr)(p) }, + REQ_SET_VRING_BASE: func(p unsafe.Pointer) interface{} { return (*VhostVringState)(p) }, + REQ_SET_VRING_CALL: func(p unsafe.Pointer) interface{} { return (*U64Payload)(p) }, + REQ_SET_VRING_ENABLE: func(p unsafe.Pointer) interface{} { return (*VhostVringState)(p) }, + REQ_SET_VRING_ERR: func(p unsafe.Pointer) interface{} { return (*U64Payload)(p) }, + REQ_SET_VRING_KICK: func(p unsafe.Pointer) interface{} { return (*U64Payload)(p) }, + REQ_SET_VRING_NUM: func(p unsafe.Pointer) interface{} { return (*VhostVringState)(p) }, + REQ_SET_LOG_BASE: func(p unsafe.Pointer) interface{} { return (*VhostUserLog)(p) }, +} + +var decodeOut = map[uint32]func(unsafe.Pointer) interface{}{ + REQ_GET_FEATURES: func(p unsafe.Pointer) interface{} { return (*GetFeaturesReply)(p) }, + REQ_GET_PROTOCOL_FEATURES: func(p unsafe.Pointer) interface{} { return (*GetProtocolFeaturesReply)(p) }, +} + +var inFDCount = map[uint32]int{ + REQ_SET_BACKEND_REQ_FD: 1, + REQ_SET_VRING_CALL: 1, + REQ_SET_VRING_ERR: 1, + REQ_ADD_MEM_REG: 1, + REQ_SET_VRING_KICK: 1, + REQ_SET_LOG_BASE: 1, +} + +func (r *GetFeaturesReply) String() string { + return 
fmt.Sprintf("{%s}", + maskToString(featureNames, r.Mask)) +} + +type SetFeaturesRequest struct { + Mask uint64 +} + +func (r *SetFeaturesRequest) String() string { + return fmt.Sprintf("{%s}", + maskToString(featureNames, r.Mask)) +} + +type GetProtocolFeaturesReply struct { + Mask uint64 +} + +func (r *GetProtocolFeaturesReply) String() string { + return fmt.Sprintf("{%s}", + maskToString(protocolFeatureNames, r.Mask)) +} + +type SetProtocolFeaturesRequest struct { + Mask uint64 +} + +func (r *SetProtocolFeaturesRequest) String() string { + return fmt.Sprintf("{%s}", + maskToString(protocolFeatureNames, r.Mask)) +} + +type U64Payload struct { + Num uint64 +} + +func (p *U64Payload) String() string { + return fmt.Sprintf("{%d}", p.Num) +} + +/* +typedef union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + u64 uint64 + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserMemRegMsg mem_reg; + VhostUserLog log; + struct vhost_iotlb_msg iotlb; + VhostUserConfig config; + VhostUserCryptoSession session; + VhostUserVringArea area; + VhostUserInflight inflight; + VhostUserShared object; + VhostUserTransferDeviceState transfer_state; +} VhostUserPayload; +*/ + +type VhostVringState struct { + Index uint32 + Num uint32 // unsigned int? +} + +func (s *VhostVringState) String() string { + return fmt.Sprintf("idx %d num %d", s.Index, s.Num) +} + +type VhostVringAddr struct { + Index uint32 + /* Option flags. */ + Flags uint32 + /* Flag values: */ + /* Whether log address is valid. If set enables logging. */ + //#define VHOST_VRING_F_LOG 0 + + /* Start of array of descriptors (virtually contiguous) */ + DescUserAddr uint64 + /* Used structure address. Must be 32 bit aligned */ + UsedUserAddr uint64 + /* Available structure address. Must be 16 bit aligned */ + AvailUserAddr uint64 + /* Logging support. 
*/ + /* Log writes to used structure, at offset calculated from specified + * address. Address must be 32 bit aligned. */ + LogGuestAddr uint64 +} + +func (a *VhostVringAddr) String() string { + return fmt.Sprintf("idx %d flags %x Desc %x Used %x Avail %x LogGuest %x", + a.Index, a.Flags, a.DescUserAddr, a.UsedUserAddr, + a.AvailUserAddr, a.LogGuestAddr) +} + +// virtio_ring.h + +// must be aligned on 4 bytes, but that's automatic? +type VringUsedElement struct { + ID uint32 + Len uint32 +} + +func (ue *VringUsedElement) String() string { + return fmt.Sprintf("{id: %d len: %d}", ue.ID, ue.Len) +} + +// aligned 4 bytes +type VringUsed struct { + Flags uint16 + Idx uint16 + Ring0 VringUsedElement +} + +// qemu:include/standard-headers/linux/virtio_ring.h +const ( + /* This marks a buffer as continuing via the next field. */ + VRING_DESC_F_NEXT = 1 + /* This marks a buffer as write-only (otherwise read-only). */ + VRING_DESC_F_WRITE = 2 + /* This means the buffer contains a list of buffer descriptors. 
*/ + VRING_DESC_F_INDIRECT = 4 +) + +var vringDescNames = map[int]string{ + 0: "NEXT", + 1: "WRITE", + 2: "INDIRECT", +} + +// Aligned 16 byte +type VringDesc struct { + Addr uint64 + Len uint32 + Flags uint16 + Next uint16 +} + +func (d VringDesc) String() string { + return fmt.Sprintf("[0x%x,+0x%x) %s next %d", d.Addr, d.Len, maskToString(vringDescNames, uint64(d.Flags)), d.Next) +} + +// aligned on 2 bytes +type VringAvail struct { + Flags uint16 + Idx uint16 + Ring0 uint16 +} + +type VhostUserMemoryRegion struct { + GuestPhysAddr uint64 + MemorySize uint64 + DriverAddr uint64 + MmapOffset uint64 +} + +func (r *VhostUserMemoryRegion) String() string { + return fmt.Sprintf("Guest [0x%x,+0x%x) Driver %x MmapOff %x", + r.GuestPhysAddr, r.MemorySize, r.DriverAddr, r.MmapOffset) +} + +type VhostUserMemory struct { + Nregions uint32 + Padding uint32 + Regions [VHOST_MEMORY_BASELINE_NREGIONS]VhostUserMemoryRegion +} + +type VhostUserMemRegMsg struct { + Padding uint64 + Region VhostUserMemoryRegion +} + +type VhostUserLog struct { + MmapSize uint64 + MmapOffset uint64 +} + +func (l *VhostUserLog) String() string { + return fmt.Sprintf("[0x%x,+0x%x)", l.MmapSize, l.MmapOffset) +} + +type VhostUserConfig struct { + Offset uint32 + Size uint32 + Flags uint32 + Region [MAX_CONFIG_SIZE]uint8 +} + +/* +#define VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN 512 +#define VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN 64 +#define VHOST_CRYPTO_ASYM_MAX_KEY_LEN 1024 + +type VhostUserCryptoSession struct { + OpCode uint64 + union { + struct { + CryptoDevBackendSymSessionInfo session_setup_data; + key[VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN] uint8 + auth_key[VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN] uint8 + } sym; + struct { + CryptoDevBackendAsymSessionInfo session_setup_data; + key[VHOST_CRYPTO_ASYM_MAX_KEY_LEN] uint8 + } asym; + } u; + + /* session id for success, -1 on errors * / + SessionId int64 +} ; +*/ + +type VhostUserVringArea struct { + U64 uint64 + Size uint64 + Offset uint64 +} + +type VhostUserInflight 
struct { + MmapSize uint64 + MmapOffset uint64 + NumQueues uint16 + QueueSize uint16 +} + +type VhostUserShared struct { + Uuid [16]byte +} + +type Header struct { + Request uint32 + /* + #define VHOST_USER_VERSION_MASK (0x3) + #define VHOST_USER_REPLY_MASK (0x1 << 2) + #define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + */ + Flags uint32 + /* the following payload size */ + Size uint32 +} + +/* Request payload of VHOST_USER_SET_DEVICE_STATE_FD */ +type VhostUserTransferDeviceState struct { + Direction uint32 + Phase uint32 +} + +/* no alignment requirement */ +type VhostIotlbMsg struct { + Iova uint64 + Size uint64 + Uaddr uint64 + /* + #define VHOST_ACCESS_RO 0x1 + #define VHOST_ACCESS_WO 0x2 + #define VHOST_ACCESS_RW 0x3 + */ + Perm uint8 + /* + #define VHOST_IOTLB_MISS 1 + #define VHOST_IOTLB_UPDATE 2 + #define VHOST_IOTLB_INVALIDATE 3 + #define VHOST_IOTLB_ACCESS_FAIL 4 + */ + /* + * VHOST_IOTLB_BATCH_BEGIN and VHOST_IOTLB_BATCH_END allow modifying + * multiple mappings in one go: beginning with + * VHOST_IOTLB_BATCH_BEGIN, followed by any number of + * VHOST_IOTLB_UPDATE messages, and ending with VHOST_IOTLB_BATCH_END. + * When one of these two values is used as the message type, the rest + * of the fields in the message are ignored. There's no guarantee that + * these changes take place automatically in the device. + */ + /* + #define VHOST_IOTLB_BATCH_BEGIN 5 + #define VHOST_IOTLB_BATCH_END 6 + */ + Type uint8 +}