Virtio Network

Virtio note

virtio-net qemu device

from QEMU 2.9 source code.

MMIO and virtqueue_kick()

virtqueue_kick() is an MMIO operation, and it triggers the virtio device to process output data from the driver. The output interface is VirtQueue->handle_output() or VirtQueue->handle_aio_output(). Virtio-net uses the output interface to send packets (TX path).

// virtio_mmio_write() => virtio_queue_notify() => vq->{handle_output,handle_aio_output}()

// pseudocode
void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, unsigned size)
    VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);

    if offset == VIRTIO_MMIO_QUEUE_NOTIFY:
        vdev->vq[value].handle_output(vdev, &vdev->vq[value])

// handle_output interface
void handle_output(VirtIODevice *, VirtQueue *);
bool handle_aio_output(VirtIODevice *, VirtQueue *);

virtio-mmio device handles MMIO by VirtIOMMIOProxy’s virtio_mmio_write().

The virtio driver does virtqueue_kick() when the MMIO offset (guest address) is VIRTIO_MMIO_QUEUE_NOTIFY. The device handles virtqueue_kick() by virtio_queue_notify().

In virtio_queue_notify(), the device handles the output of one virtqueue. The MMIO value selects which virtqueue should be used. Each virtqueue has its handle_output() or handle_aio_output() callback.

virtio device can use virtio_add_queue() to set VirtQueue->handle_output, and use virtio_queue_aio_set_host_notifier_handler() to set VirtQueue->handle_aio_output.

Virtio TX

virtio-net handles TX by timer or QEMU BH. We use BH as example:

void virtio_net_add_queue(VirtIONet *n, int index){
    if( ... ){
        n->vqs[index].tx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_tx_timer);
        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_tx_timer, &n->vqs[index]);
    }
    else{
        n->vqs[index].tx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_tx_bh);
        n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
    }
}
../../_images/virtio.png

virtio_net_handle_tx_bh() handles the MMIO notify, and asynchronously schedules the BH.

Async BH callback virtio_net_tx_bh():

virtio_net_tx_bh() => virtio_net_flush_tx()

real TX virtio_net_flush_tx()

  • guest driver 傳輸的資料放在 VirtIONetQueue->tx_vq

  • virtio device 每次會拿一個 VirtQueue 的 element 來做傳輸.

    • virtqueue_pop(): 取一個 element

    • emulate device 傳輸: 呼叫 qemu_send_packet() 系列 API.

  • completion interrupt

  • net_hdr_swap, host_hdr_len

Virtio API

TYPE_VIRTIO_NET => TYPE_VIRTIO_DEVICE => TYPE_DEVICE

- VirtIONet => VirtIODevice => DeviceState
- X => VirtioDeviceClass => DeviceClass

struct VirtIONet {
    VirtIODevice parent_obj

    VirtIONetQueue *vqs
    VirtQueue *ctrl_vq

    virtio_net_conf net_conf
    NICState *nic
    NICConf nic_conf
}

struct VirtIONetQueue {
    VirtQueue *rx_vq
    VirtQueue *tx_vq

    QEMUTimer *tx_timer
    QEMUBH *tx_bh
}

VirtIODevice, VirtQueue and VRing:

hw/virtio/virtio.c
include/hw/virtio/virtio.h

struct VirtIODevice {
    VirtQueue *vq
    uint8_t isr
}

struct VirtQueue {
    VRing *vring
    VirtIOHandleOutput
    VirtIOHandleAIOOutput
    EventNotifier guest_notifier, host_notifier;
}

struct VRing {
    hwaddr desc, avail, used
    VRingMemoryRegionCaches *caches // rcu + 3 MemoryRegionCache (desc, avail, used)
}

virtqueue API:

elem = virtqueue_alloc_element(sz, out_num, in_num);
virtqueue_detach_element(vq, elem, len);

elem = virtqueue_pop()
virtqueue_unpop(q->rx_vq, elem, total);
virtqueue_fill(): vring_used_write()
virtqueue_flush(): vq->used_idx += count
virtqueue_push():  virtqueue_fill() 一個 element, 然後 virtqueue_flush() 1 個.

virtio_notify(vdev, q->rx_vq): 透過 qemu_set_irq() inject VIRQ 到 guest
virtio_queue_set_notification(): 開關 VRingUsed 的 interrupt suppression.
virtqueue_read_next_desc()

virtqueue_map_desc()
//
elem = virtqueue_pop()                      => dma_memory_map()   => address_space_map()
virtqueue_unpop(q->rx_vq, elem, total);     => dma_memory_unmap() => address_space_unmap()

virtio_notify() has 2 backends:

virtio_notify()

- VirtIODevice->isr |= value
- k->notify(); where k (VirtioBusClass) is the class of the parent bus of the VirtIODevice

k->notify() has 2 kinds

- virtio_mmio_update_irq()
- virtio_pci_notify()

進入正題 virtqueue_pop():

// 一開始先從現在 VRing 的 head 讀出一個 descriptor
i = head;
vring_desc_read(vdev, &desc, desc_cache, i);

// indirect descriptor 的情況: 需要新的 MemoryRegionCache, 並改從 indirect table 的開頭重新讀 descriptor
if (desc.flags & VRING_DESC_F_INDIRECT) {
    len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false);
    desc_cache = &indirect_desc_cache;
    i = 0;
    vring_desc_read(vdev, &desc, desc_cache, i);
}

// Collect all the descriptors (VRingDesc is a linked-list)
do {
    map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov, VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len);
    rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i);
} while(rc == VIRTQUEUE_READ_DESC_MORE)


// 把從 VRing 複製出的資料包裝成 VirtQueueElement
elem = virtqueue_alloc_element(sz, out_num, in_num);
for(){
    elem->out_addr[i] = addr[i];
    elem->out_sg[i] = iov[i];
}
...

return elem;

VRingMemoryRegionCache:

virtio_init_region_cache(vdev, n):
  初始化 VDev 裡的第 n 個 VQ 的 VR 的 VRingMemoryRegionCaches 的 MemoryRegionCache
  使用 address_space_cache_init()
  初始化參數: VR.desc, get_desc_size(vdev), vdev->dma_as
virtio_queue_update_rings(): 放好 VR.{desc, avail, used} 的 hwaddr, 然後用 virtio_init_region_cache() 初始化 MemoryRegionCache

low level VRing access API:

vring_desc_read()
    translate endianness:
    read from MemoryRegionCache (addr=i*sizeof(VRingDesc), len=sizeof(VRingDesc))

vring_used_write()
    translate endianness: virtio_tswap32s()
    write a VRingUsedElem (uelem, sizeof(VRingUsedElem))
      to VRingMemoryRegionCaches (&caches->used, pa=offsetof(VRingUsed, ring[i]), len=sizeof(VRingUsedElem)),
      using address_space_write_cached()
    cache invalidate: address_space_cache_invalidate()

// 同類型 API
vring_desc_read(): 讀取 VR 的第 n 個 VRingDesc
vring_avail_ring(): 讀取 VR 的 VRingAvail 的第 i 個 ring entry
vring_avail_flags(): 讀取 VR 的 VRingAvail 的 flags
vring_avail_idx(): 讀取 VR 的 VRingAvail 的 idx
vring_get_used_event(): vring_avail_ring(vq, vq->vring.num);

vring_used_write(): 寫入 VR 的 VRingUsed 的第 i 個 VRingUsedElem
vring_used_idx(): 讀取 VR 的 VRingUsed 的 idx
vring_used_idx_set(): 寫入 ... (同上)
vring_used_flags_set_bit(vq, mask): VR's VRingUsed's flags |= mask
vring_used_flags_unset_bit(): 與上面相反

vring_set_avail_event()

virtio-access API

  • virtio_tswap<size>s() 使用 bswap() API 轉換 endianness

  • virtio_ld*_phys(): 使用 address_space_read() 讀取 guest memory

  • virtio_ld*_phys_cached(): 使用 address_space_read() 讀取 guest memory, 並且有 MemoryRegionCache 的加速

files:

virtio-access.h: 定義對 VRing shared memory 的 access API
memory_ldst.inc.c (exec.c):
  address_space_ldl_internal() 實作上相近於 address_space_read()
  address_space_ldl_internal_cached() 實作上相近於 address_space_read(), 並使用 MemoryRegionCache 的版本.
bswap.h: 進行 big-endian 與 little-endian 之間的轉換

address_space APIs

address_space_cache_init()
address_space_read_cached()

iov series APIs

  • iov_copy():

    unsigned iov_copy(
        struct iovec *dst_iov, unsigned int dst_iov_cnt,
        const struct iovec *iov, unsigned int iov_cnt,
        size_t offset, size_t bytes)
    
    • copy from (start=iov+offset, len=bytes) to (start=dst_iov, len=bytes)

    • shallow copy => iov and dst_iov use same iov_base pointer

    • return copied iov_cnt

  • iov_to_buf():

    size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt,
        size_t offset, void *buf, size_t bytes)
    
    • copy from (start=iov+offset, len=bytes) to (start=buf, len=bytes)

    • deep copy

    • return copied length

virtio-net kernel driver

from Linux Kernel 3.14.

virtqueue, vring, and virtio_device

source code:

include/uapi/linux/virtio_ring.h: // struct vring
include/linux/virtio.h:           // struct virtqueue
driver/virtio/virtio_ring.c       // struct vring_virtqueue
  • include/uapi/linux/virtio_*.h are virtio ABIs between guest and host.

struct vring_virtqueue {
    struct virtqueue vq;
    struct vring vring;
    bool (*notify)(struct virtqueue *vq);
}
struct virtqueue {
    struct virtio_device *vdev;
}

// inheritance:
//   virtio_mmio_device -> virtio_device
//   virtio_pci_device -> virtio_device

// constructor
vring_new_virtqueue(..., bool (*notify)(struct virtqueue *), ... )

methods of virtqueue and vring

virtqueue methods:

virtqueue_add_*()
virtqueue_get_buf()
virtqueue_kick()
virtqueue_disable_cb()
virtqueue_enable_cb*()

virtqueue_kick() is MMIO:

virtqueue_kick()
- virtqueue_kick_prepare()
- virtqueue_notify()
  - vring_virtqueue->notify()

// vring_virtqueue->notify() = vm_notify() or vp_notify()
vm_notify(struct virtqueue *vq)
  virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
  writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);

vp_notify(struct virtqueue *vq)
  virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
  iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);

virtio net

struct virtnet_info {
    virtio_device *vdev;
    virtqueue *cvq;
    net_device *dev;

    send_queue *sq;
    receive_queue *rq;
}

struct send_queue {
    struct virtqueue *vq;
    struct scatterlist sg[MAX_SKB_FRAGS + 2];
}

struct receive_queue {
    struct virtqueue *vq;
    struct napi_struct napi;
    struct scatterlist sg[MAX_SKB_FRAGS + 2];
}

network driver netdev and NAPI:

net_device_ops virtnet_netdev {
    .ndo_start_xmit      = start_xmit
    .ndo_poll_controller = virtnet_netpoll   // napi_struct
}
netif_napi_add(..., virtnet_poll, ...)

transmit use virtqueue_add_outbuf() and virtqueue_kick():

netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
// virtnet_info *vi = netdev_priv(dev);
// send_queue *sq = &vi->sq[qnum];

- xmit_skb(struct send_queue *sq, struct sk_buff *skb)

  - virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);

- virtqueue_kick(sq->vq)

receive related:

// poll() use virtqueue_get_buf()
virtnet_poll(struct napi_struct *napi, int budget)
// struct receive_queue *rq = container_of(napi, struct receive_queue, napi);

- virtqueue_receive(rq, budget)

  - virtqueue_get_buf(rq->vq, &len);
  - receive_buf(vi, rq, buf, len);

- napi_complete_done(napi, received);
- napi_schedule(napi);
- virtqueue_poll(rq->vq, r);
- virtqueue_disable_cb(rq->vq);

// napi_schedule
napi_schedule() <= skb_recv_done() <= callbacks[rxq2vq(i)], virtio_device vdev->config->find_vqs(callbacks)
napi_schedule() <= virtnet_napi_poll(), virtnet_netpoll()

// napi enable/disable
virtnet_open(), virtnet_restore(), refill_work() =>
virtnet_napi_enable() => napi_enable() + napi_schedule()
// CONFIG_PM_SLEEP: virtnet_freeze()/virtnet_restore()
// INIT_DELAYED_WORK(&vi->refill, refill_work); // schedule_delayed_work()