Skip to content
Snippets Groups Projects
virtio-net.cc 20.5 KiB
Newer Older
  • Learn to ignore specific revisions
  • /*
     * Copyright (C) 2013 Cloudius Systems, Ltd.
     *
     * This work is open source software, licensed under the terms of the
     * BSD license as described in the LICENSE file in the top-level directory.
     */
    
    
    #include <sys/cdefs.h>
    
    
    Guy Zana's avatar
    Guy Zana committed
    #include "drivers/virtio.hh"
    #include "drivers/virtio-net.hh"
    
    #include "drivers/pci-device.hh"
    
    #include "interrupt.hh"
    
    #include "mempool.hh"
    #include "mmu.hh"
    
    #include <string>
    #include <string.h>
    #include <map>
    #include <errno.h>
    
    #include <osv/debug.h>
    
    #include "sched.hh"
    
    #include "osv/trace.hh"
    
    #include "drivers/clock.hh"
    #include "drivers/clockevent.hh"
    
    #include <osv/device.h>
    
    Guy Zana's avatar
    Guy Zana committed
    #include <osv/ioctl.h>
    
    #include <bsd/sys/net/ethernet.h>
    #include <bsd/sys/net/if_types.h>
    
    Dor Laor's avatar
    Dor Laor committed
    #include <bsd/sys/sys/param.h>
    
    #include <bsd/sys/net/ethernet.h>
    #include <bsd/sys/net/if_vlan_var.h>
    #include <bsd/sys/netinet/in.h>
    #include <bsd/sys/netinet/ip.h>
    #include <bsd/sys/netinet/udp.h>
    #include <bsd/sys/netinet/tcp.h>
    
    // Tracepoints used by this driver. Each declaration gives the tracepoint
    // name, its printf-style format string and the argument types.

    // Rx path: one event per packet passed to if_input, and one per wake-up
    // of the receiver.
    TRACEPOINT(trace_virtio_net_rx_packet, "if=%d, len=%d", int, int);
    TRACEPOINT(trace_virtio_net_rx_wake, "");
    
    // Rx ring refill: entry into fill_rx_ring() and the summary emitted at
    // its end.
    TRACEPOINT(trace_virtio_net_fill_rx_ring, "if=%d", int);
    TRACEPOINT(trace_virtio_net_fill_rx_ring_added, "if=%d, added=%d", int, int);
    
    // Tx path. NOTE(review): tx_locked() passes the scatter-gather vector size
    // as the second argument, not a byte count.
    TRACEPOINT(trace_virtio_net_tx_packet, "if=%d, len=%d", int, int);
    
    // Tx failure / back-pressure events.
    TRACEPOINT(trace_virtio_net_tx_failed_add_buf, "if=%d", int);
    TRACEPOINT(trace_virtio_net_tx_no_space_calling_gc, "if=%d", int);
    
    using namespace memory;
    
    // TODO list
    // irq thread affinity and tx affinity
    // tx zero copy
    // vlans?
    
    Guy Zana's avatar
    Guy Zana committed
    
    namespace virtio {
    
    
        // Running count of constructed devices; the constructor takes the
        // current value as the device id (_id = _instance++).
        int virtio_net::_instance = 0;
    
        // Tag passed as the first argument of every tprintf_* call below.
        #define virtio_net_tag "virtio-net"
    
        // Logging shorthands that forward to tprintf_d/_i/_w/_e with the
        // virtio-net tag (presumably debug/info/warning/error - confirm
        // against osv/debug.h).
        #define virtio_net_d(...)   tprintf_d(virtio_net_tag, __VA_ARGS__)
        #define virtio_net_i(...)   tprintf_i(virtio_net_tag, __VA_ARGS__)
        #define virtio_net_w(...)   tprintf_w(virtio_net_tag, __VA_ARGS__)
        #define virtio_net_e(...)   tprintf_e(virtio_net_tag, __VA_ARGS__)
    
    
        /**
         * ioctl handler; the constructor installs it as if_ioctl on the ifnet.
         *
         * In the visible code it mirrors IFF_UP from if_flags into
         * IFF_DRV_RUNNING in if_drv_flags and forwards the request to
         * ether_ioctl().
         *
         * NOTE(review): this copy of the function has lost lines - several
         * case labels, break statements and closing braces are absent (the
         * debug strings for SIOCSIFFLAGS and SIOCDELMULTI appear without their
         * labels). Restore them from the repository before changing logic.
         *
         * @param ifp     interface the request targets
         * @param command ioctl request code
         * @param data    request argument, handed through to ether_ioctl()
         * @return 0 unless ether_ioctl() is reached, in which case its result
         */
        static int virtio_if_ioctl(
                struct ifnet *ifp,
                u_long command,
                caddr_t data)
        {
    
            virtio_net_d("virtio_if_ioctl %x", command);
    
    
            int error = 0;
            switch(command) {
            case SIOCSIFMTU:
    
                virtio_net_d("SIOCSIFMTU");
    
                virtio_net_d("SIOCSIFFLAGS");
    
                /* Change status ifup, ifdown */
                if (ifp->if_flags & IFF_UP) {
                    ifp->if_drv_flags |= IFF_DRV_RUNNING;
    
                    virtio_net_d("if_up");
    
                } else {
                    ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
    
                    virtio_net_d("if_down");
    
                virtio_net_d("SIOCDELMULTI");
    
                virtio_net_d("redirecting to ether_ioctl()...");
    
                // Hand the request to the generic Ethernet ioctl handler
                error = ether_ioctl(ifp, command, data);
                break;
            }
    
            return(error);
        }
    
    
        /**
         * Flushes the transmit queues of an interface.
         *
         * @param ifp interface handle supplied by the upper layer
         */
        static void virtio_if_qflush(struct ifnet *ifp)
        {
            // virtio_net currently has no Tx queue of its own, so flushing
            // the upper layer queues is all there is to do.
            if_qflush(ifp);
        }
    
        /**
         * Transmits a single mbuf instance; the constructor installs this
         * function as if_transmit on the ifnet.
         *
         * The packet is queued through virtio_net::tx_locked() and, when that
         * succeeds, the device is notified with kick(1).
         *
         * NOTE(review): the end of this function (return statement and closing
         * brace) is missing from this copy, and the trailing "Virtio-net init"
         * log line looks out of place here - verify against the repository.
         *
         * @param ifp upper layer instance handle; if_softc holds the virtio_net
         * @param m_head mbuf to transmit
         *
         * @return 0 in case of success and an appropriate error code
         *         otherwise
         */
        static int virtio_if_transmit(struct ifnet* ifp, struct mbuf* m_head)
    
        {
            // if_softc was set to the driver instance by the constructor
            virtio_net* vnet = (virtio_net*)ifp->if_softc;
    
    
            virtio_net_d("%s_start", __FUNCTION__);
    
            virtio_net_d("*** processing packet! ***");
    
            int error = vnet->tx_locked(m_head);
    
            // Notify the device only when the buffer was actually queued
            if (!error)
                vnet->kick(1);
    
            virtio_net_d("Virtio-net init");
    
        /**
         * Constructs one virtio-net device instance.
         *
         * Visible steps: bind virtqueue 0 to the Rx side (serviced by
         * receiver()) and virtqueue 1 to the Tx side, take a device id from
         * _instance, call setup_features(), pick the virtio header size
         * depending on _mergeable_bufs, allocate the BSD ifnet and wire up its
         * ioctl/transmit/qflush callbacks, derive if_capabilities from the
         * checksum/TSO feature flags, start the Rx poll thread and finally set
         * VIRTIO_CONFIG_S_DRIVER_OK.
         *
         * NOTE(review): this copy is missing lines (for example the closing
         * brace of the if_alloc failure branch, the statement that consumes
         * the two interrupt-binding initializers, and the closing brace of the
         * constructor). Do not edit the logic from this view.
         *
         * @param dev PCI device, forwarded to the virtio_driver base class
         */
        virtio_net::virtio_net(pci::device& dev)
    
            : virtio_driver(dev),
              _rxq(get_virt_queue(0), [this] { this->receiver(); }),
              _txq(get_virt_queue(1))
    
    Guy Zana's avatar
    Guy Zana committed
        {
    
            sched::thread* poll_task = &_rxq.poll_task;
    
            _driver_name = "virtio-net";
    
            virtio_i("VIRTIO NET INSTANCE");
    
            _id = _instance++;
    
            setup_features();
    
            _hdr_size = (_mergeable_bufs)? sizeof(virtio_net_hdr_mrg_rxbuf):sizeof(virtio_net_hdr);
    
    
            //initialize the BSD interface _if
            _ifn = if_alloc(IFT_ETHER);
            if (_ifn == NULL) {
               //FIXME: need to handle this case - expand the above function not to allocate memory and
               // do it within the constructor.
    
               virtio_net_w("if_alloc failed!");
    
            if_initname(_ifn, "eth", _id);
    
            _ifn->if_mtu = ETHERMTU;
            _ifn->if_softc = static_cast<void*>(this);
            _ifn->if_flags = IFF_BROADCAST /*| IFF_MULTICAST*/;
            _ifn->if_ioctl = virtio_if_ioctl;
    
            _ifn->if_transmit = virtio_if_transmit;
            _ifn->if_qflush = virtio_if_qflush;
    
            IFQ_SET_MAXLEN(&_ifn->if_snd, _txq.vqueue->size());
    
    Dor Laor's avatar
    Dor Laor committed
    
            _ifn->if_capabilities = 0;
    
            if (_csum) {
                _ifn->if_capabilities |= IFCAP_TXCSUM;
    
                if (_host_tso4) {
    
    Dor Laor's avatar
    Dor Laor committed
                    _ifn->if_capabilities |= IFCAP_TSO4;
    
                    _ifn->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
                }
    
    Dor Laor's avatar
    Dor Laor committed
            }
    
            if (_guest_csum) {
                _ifn->if_capabilities |= IFCAP_RXCSUM;
                if (_guest_tso4)
                    _ifn->if_capabilities |= IFCAP_LRO;
            }
    
    
            _ifn->if_capenable = _ifn->if_capabilities | IFCAP_HWSTATS;
    
            //Start the polling thread before attaching it to the Rx interrupt
            poll_task->start();
    
    
                { 0, [&] { _rxq.vqueue->disable_interrupts(); }, poll_task },
                { 1, [&] { _txq.vqueue->disable_interrupts(); }, nullptr }
    
            add_dev_status(VIRTIO_CONFIG_S_DRIVER_OK);
    
        // Destructor. Only TODO notes are visible here; no cleanup statement
        // appears in this copy.
        // NOTE(review): the braces of the body are missing from this view.
        virtio_net::~virtio_net()
    
            // TODO: In theory maintain the list of free instances and gc it,
            // including the thread objects and their stacks.
    
            // Will need to clear the pending requests in the ring too.
    
            // TODO: add a proper cleanup for rx.poll_task() here.
            //
            // Since this will involve a rework of the virtio layer - do it for
            // all virtio drivers in a separate patchset.
    
    
        /**
         * Reads the device configuration block into _config and caches the
         * negotiated feature bits in member flags.
         *
         * Flag to feature mapping set below:
         *   _mergeable_bufs <- VIRTIO_NET_F_MRG_RXBUF
         *   _status         <- VIRTIO_NET_F_STATUS
         *   _tso_ecn        <- VIRTIO_NET_F_GUEST_ECN
         *   _host_tso_ecn   <- VIRTIO_NET_F_HOST_ECN
         *   _csum           <- VIRTIO_NET_F_CSUM
         *   _guest_csum     <- VIRTIO_NET_F_GUEST_CSUM
         *   _guest_tso4     <- VIRTIO_NET_F_GUEST_TSO4
         *   _host_tso4      <- VIRTIO_NET_F_HOST_TSO4
         *
         * NOTE(review): the return statement and closing brace are missing
         * from this copy, so the returned value cannot be documented here.
         */
        bool virtio_net::read_config()
        {
            //read all of the net config  in one shot
    
            virtio_conf_read(virtio_pci_config_offset(), &_config, sizeof(_config));
    
            // The MAC address is only logged when the MAC feature was negotiated
            if (get_guest_feature_bit(VIRTIO_NET_F_MAC))
    
                virtio_net_i("The mac addr of the device is %x:%x:%x:%x:%x:%x",
                        (u32)_config.mac[0],
                        (u32)_config.mac[1],
                        (u32)_config.mac[2],
                        (u32)_config.mac[3],
                        (u32)_config.mac[4],
    
                        (u32)_config.mac[5]);
    
    
            _mergeable_bufs = get_guest_feature_bit(VIRTIO_NET_F_MRG_RXBUF);
    
    Dor Laor's avatar
    Dor Laor committed
            _status = get_guest_feature_bit(VIRTIO_NET_F_STATUS);
            _tso_ecn = get_guest_feature_bit(VIRTIO_NET_F_GUEST_ECN);
            _host_tso_ecn = get_guest_feature_bit(VIRTIO_NET_F_HOST_ECN);
            _csum = get_guest_feature_bit(VIRTIO_NET_F_CSUM);
            _guest_csum = get_guest_feature_bit(VIRTIO_NET_F_GUEST_CSUM);
            _guest_tso4 = get_guest_feature_bit(VIRTIO_NET_F_GUEST_TSO4);
            _host_tso4 = get_guest_feature_bit(VIRTIO_NET_F_HOST_TSO4);
    
            virtio_net_i("Features: %s=%d,%s=%d", "Status", _status, "TSO_ECN", _tso_ecn);
            virtio_net_i("Features: %s=%d,%s=%d", "Host TSO ECN", _host_tso_ecn, "CSUM", _csum);
            virtio_net_i("Features: %s=%d,%s=%d", "Guest_csum", _guest_csum, "guest tso4", _guest_tso4);
            virtio_net_i("Features: %s=%d", "host tso4", _host_tso4);
    
    Dor Laor's avatar
    Dor Laor committed
        /**
         * Original comment from FreeBSD
         * Alternative method of doing receive checksum offloading. Rather
         * than parsing the received frame down to the IP header, use the
         * csum_offset to determine which CSUM_* flags are appropriate. We
         * can get by with doing this only because the checksum offsets are
         * unique for the things we care about.
         *
         * @return true if csum is bad and false if csum is ok (!!!)
         */
    
        bool virtio_net::bad_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
    
    Dor Laor's avatar
    Dor Laor committed
        {
            struct ether_header *eh;
            struct ether_vlan_header *evh;
            struct udphdr *udp;
            int csum_len;
            u16 eth_type;
    
            csum_len = hdr->csum_start + hdr->csum_offset;
    
            if (csum_len < (int)sizeof(struct ether_header) + (int)sizeof(struct ip))
                return true;
    
            if (m->m_hdr.mh_len < csum_len)
    
    Dor Laor's avatar
    Dor Laor committed
                return true;
    
            eh = mtod(m, struct ether_header *);
            eth_type = ntohs(eh->ether_type);
            if (eth_type == ETHERTYPE_VLAN) {
                evh = mtod(m, struct ether_vlan_header *);
                eth_type = ntohs(evh->evl_proto);
            }
    
    
            // How come - no support for IPv6?!
    
    Dor Laor's avatar
    Dor Laor committed
            if (eth_type != ETHERTYPE_IP) {
                return true;
            }
    
            /* Use the offset to determine the appropriate CSUM_* flags. */
            switch (hdr->csum_offset) {
            case offsetof(struct udphdr, uh_sum):
    
                if (m->m_hdr.mh_len < hdr->csum_start + (int)sizeof(struct udphdr))
    
    Dor Laor's avatar
    Dor Laor committed
                    return true;
                udp = (struct udphdr *)(mtod(m, uint8_t *) + hdr->csum_start);
                if (udp->uh_sum == 0)
                    return false;
    
                /* FALLTHROUGH */
    
            case offsetof(struct tcphdr, th_sum):
    
                m->M_dat.MH.MH_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
                m->M_dat.MH.MH_pkthdr.csum_data = 0xFFFF;
    
    Dor Laor's avatar
    Dor Laor committed
                break;
    
            default:
                return true;
            }
    
            return false;
        }
    
    
        /**
         * Rx service routine; the constructor binds it to the Rx queue
         * (_rxq) as the function run by its poll task.
         *
         * Visible flow: wait until the used ring of the Rx virtqueue is not
         * empty, then drain it. For every buffer the virtio header is copied
         * out, follow-up fragments (mergeable buffers) are chained onto the
         * head mbuf, the virtio header bytes are trimmed with m_adj(), the
         * checksum is classified through bad_rx_csum() when Rx checksum
         * offload is enabled and the header asks for it, and the packet is
         * passed to if_input. Per-pass counters are added to _rxq.stats at
         * the end.
         *
         * NOTE(review): this copy has lost lines - the outer loop header, the
         * drop accounting/free in the short-packet branch, the body of the
         * nullptr check in the fragment loop, several closing braces and the
         * statement guarded by refill_ring_cond() are absent. Do not edit the
         * logic from this view.
         */
        void virtio_net::receiver()
        {
            vring* vq = _rxq.vqueue;
    
    
                // Wait for rx queue (used elements)
    
                virtio_driver::wait_for_queue(vq, &vring::used_ring_not_empty);
    
                trace_virtio_net_rx_wake();
    
    
                u32 len;
                int nbufs;
    
                struct mbuf* m = static_cast<struct mbuf*>(vq->get_buf_elem(&len));
    
                // Number of leading bytes (the virtio header) to strip later
                u32 offset = _hdr_size;
    
                // Counters for this pass; folded into _rxq.stats at the end
                u64 rx_drops = 0, rx_packets = 0, csum_ok = 0;
                u64 csum_err = 0, rx_bytes = 0;
    
    
                // use local header that we copy out of the mbuf since we're
                // truncating it.
    
                struct virtio_net_hdr_mrg_rxbuf mhdr;
    
    
                while (m != nullptr) {
    
    
                    // TODO: should get out of the loop
    
                    vq->get_buf_finalize();
    
                    // Bad packet/buffer - discard and continue to the next one
    
                    if (len < _hdr_size + ETHER_HDR_LEN) {
    
    
                        m = static_cast<struct mbuf*>(vq->get_buf_elem(&len));
    
                        continue;
                    }
    
                    memcpy(&mhdr, mtod(m, void *), _hdr_size);
    
                    // Without mergeable buffers a packet is always one buffer
                    if (!_mergeable_bufs) {
                        nbufs = 1;
                    } else {
                        nbufs = mhdr.num_buffers;
                    }
    
                    m->M_dat.MH.MH_pkthdr.len = len;
                    m->M_dat.MH.MH_pkthdr.rcvif = _ifn;
                    m->M_dat.MH.MH_pkthdr.csum_flags = 0;
                    m->m_hdr.mh_len = len;
    
                    struct mbuf* m_head, *m_tail;
                    m_tail = m_head = m;
    
    
                    // Read the fragments
    
                    while (--nbufs > 0) {
    
                        m = static_cast<struct mbuf*>(vq->get_buf_elem(&len));
                        if (m == nullptr) {
    
                        vq->get_buf_finalize();
    
                        // Never trust a length larger than the mbuf itself
                        if (m->m_hdr.mh_len < (int)len)
                            len = m->m_hdr.mh_len;
    
                        m->m_hdr.mh_len = len;
                        m->m_hdr.mh_flags &= ~M_PKTHDR;
                        m_head->M_dat.MH.MH_pkthdr.len += len;
                        m_tail->m_hdr.mh_next = m;
    
                    // skip over the virtio header bytes (offset)
                    // that are not needed by the layer above
    
                    m_adj(m_head, offset);
    
    
                    if ((_ifn->if_capenable & IFCAP_RXCSUM) &&
                        (mhdr.hdr.flags &
    
                         virtio_net_hdr::VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
                        if (bad_rx_csum(m_head, &mhdr.hdr))
                            csum_err++;
                        else
                            csum_ok++;
    
                    }
    
                    rx_packets++;
                    rx_bytes += m_head->M_dat.MH.MH_pkthdr.len;
    
                    (*_ifn->if_input)(_ifn, m_head);
    
    
                    trace_virtio_net_rx_packet(_ifn->if_index, len);
    
    
                    // The interface may have been stopped while we were
                    // passing the packet up the network stack.
                    if ((_ifn->if_drv_flags & IFF_DRV_RUNNING) == 0)
                        break;
    
    
                    // Move to the next packet
                    m = static_cast<struct mbuf*>(vq->get_buf_elem(&len));
    
                if (vq->refill_ring_cond())
    
    
                // Update the stats
                _rxq.stats.rx_drops      += rx_drops;
                _rxq.stats.rx_packets    += rx_packets;
                _rxq.stats.rx_csum       += csum_ok;
                _rxq.stats.rx_csum_err   += csum_err;
                _rxq.stats.rx_bytes      += rx_bytes;
    
            }
        }
    
        /**
         * Posts fresh receive buffers to the Rx virtqueue.
         *
         * While avail_ring_not_empty() holds, an mbuf cluster of MCLBYTES is
         * obtained with m_getjcl(), described as a single device-writable
         * scatter-gather entry and added to the ring. The device is kicked at
         * the end when at least one buffer was added.
         *
         * NOTE(review): this copy has lost lines - no null check on the
         * m_getjcl(M_NOWAIT, ...) result is visible, the body of the add_buf()
         * failure branch, the increment of the added counter and the closing
         * braces are absent. Check the repository before editing.
         */
        void virtio_net::fill_rx_ring()
        {
    
            trace_virtio_net_fill_rx_ring(_ifn->if_index);
            int added = 0;
    
            vring* vq = _rxq.vqueue;
    
            while (vq->avail_ring_not_empty()) {
    
    Dor Laor's avatar
    Dor Laor committed
                struct mbuf *m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MCLBYTES);
    
                m->m_hdr.mh_len = MCLBYTES;
    
                u8 *mdata = mtod(m, u8*);
    
                vq->init_sg();
                vq->add_in_sg(mdata, m->m_hdr.mh_len);
                if (!vq->add_buf(m)) {
    
            trace_virtio_net_fill_rx_ring_added(_ifn->if_index, added);
    
    
            if (added)
                vq->kick();
    
        // TODO: Does it really have to be "locked"?
    
        /**
         * Queues one outgoing mbuf chain on the Tx virtqueue. The debug
         * assertion below requires _tx_ring_lock to be owned by the caller.
         *
         * Visible flow: allocate a virtio_net_req that takes over m_head, let
         * tx_offload() fill the virtio header when the packet carries csum
         * flags, build a scatter-gather list made of the virtio header
         * followed by every non-empty fragment, check for room in the avail
         * ring, add the request to the ring and update the Tx statistics
         * according to rc.
         *
         * NOTE(review): this copy has lost lines - the opening brace, the
         * declaration of the local m, several closing braces and the bodies of
         * the no-room and add_buf() failure branches (where rc is presumably
         * set to ENOBUFS) are absent. How the flush parameter is used is not
         * visible either.
         *
         * @param m_head packet to transmit
         * @param flush  not referenced in the visible lines
         * @return 0 on success, EINVAL when tx_offload() rejects the packet;
         *         other codes come from lines not visible here
         */
        int virtio_net::tx_locked(struct mbuf *m_head, bool flush)
    
            DEBUG_ASSERT(_tx_ring_lock.owned(), "_tx_ring_lock is not locked!");
    
    
            virtio_net_req *req = new virtio_net_req;
    
            vring* vq = _txq.vqueue;
            auto vq_sg_vec = &vq->_sg_vec;
    
            int rc = 0;
            struct vnet_txq_stats* stats = &_txq.stats;
            u64 tx_bytes = 0;
    
            // From here on the request object holds the mbuf chain
            req->um.reset(m_head);
    
            if (m_head->M_dat.MH.MH_pkthdr.csum_flags != 0) {
    
    Dor Laor's avatar
    Dor Laor committed
                m = tx_offload(m_head, &req->mhdr.hdr);
                if ((m_head = m) == nullptr) {
                    delete req;
    
                    /* The buffer is not well-formed */
    
                    rc = EINVAL;
                    goto out;
    
            vq->init_sg();
            vq->add_out_sg(static_cast<void*>(&req->mhdr), _hdr_size);
    
            for (m = m_head; m != NULL; m = m->m_hdr.mh_next) {
    
                int frag_len = m->m_hdr.mh_len;
    
                if (frag_len != 0) {
                    virtio_net_d("Frag len=%d:", frag_len);
    
                    req->mhdr.num_buffers++;
    
                    vq->add_out_sg(m->m_hdr.mh_data, m->m_hdr.mh_len);
    
                    tx_bytes += frag_len;
    
            if (!vq->avail_ring_has_room(vq->_sg_vec.size())) {
    
                // can't call it, this is a get buf thing
    
                if (vq->used_ring_not_empty()) {
    
                    trace_virtio_net_tx_no_space_calling_gc(_ifn->if_index);
    
                    virtio_net_d("%s: no room", __FUNCTION__);
    
            if (!vq->add_buf(req)) {
    
                trace_virtio_net_tx_failed_add_buf(_ifn->if_index);
    
            trace_virtio_net_tx_packet(_ifn->if_index, vq_sg_vec->size());
    
        out:
    
            /* Update the statistics */
            switch (rc) {
            case 0: /* success */
                stats->tx_bytes += tx_bytes;
                stats->tx_packets++;
    
                if (req->mhdr.hdr.flags & virtio_net_hdr::VIRTIO_NET_HDR_F_NEEDS_CSUM)
                    stats->tx_csum++;
    
                if (req->mhdr.hdr.gso_type)
                    stats->tx_tso++;
    
                break;
            case ENOBUFS:
                stats->tx_drops++;
    
                break;
            default:
                stats->tx_err++;
            }
    
            return rc;
    
    Dor Laor's avatar
    Dor Laor committed
        struct mbuf*
        virtio_net::tx_offload(struct mbuf* m, struct virtio_net_hdr* hdr)
        {
            struct ether_header *eh;
            struct ether_vlan_header *evh;
            struct ip *ip;
            struct tcphdr *tcp;
            int ip_offset;
            u16 eth_type, csum_start;
    
            u8 ip_proto, gso_type = 0;
    
    Dor Laor's avatar
    Dor Laor committed
    
            ip_offset = sizeof(struct ether_header);
    
            if (m->m_hdr.mh_len < ip_offset) {
    
    Dor Laor's avatar
    Dor Laor committed
                if ((m = m_pullup(m, ip_offset)) == nullptr)
                    return nullptr;
            }
    
            eh = mtod(m, struct ether_header *);
            eth_type = ntohs(eh->ether_type);
            if (eth_type == ETHERTYPE_VLAN) {
                ip_offset = sizeof(struct ether_vlan_header);
    
                if (m->m_hdr.mh_len < ip_offset) {
    
    Dor Laor's avatar
    Dor Laor committed
                    if ((m = m_pullup(m, ip_offset)) == nullptr)
                        return nullptr;
                }
                evh = mtod(m, struct ether_vlan_header *);
                eth_type = ntohs(evh->evl_proto);
            }
    
            switch (eth_type) {
            case ETHERTYPE_IP:
    
                if (m->m_hdr.mh_len < ip_offset + (int)sizeof(struct ip)) {
    
    Dor Laor's avatar
    Dor Laor committed
                    m = m_pullup(m, ip_offset + sizeof(struct ip));
                    if (m == nullptr)
                        return nullptr;
                }
    
                ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
                ip_proto = ip->ip_p;
                csum_start = ip_offset + (ip->ip_hl << 2);
                gso_type = virtio_net::virtio_net_hdr::VIRTIO_NET_HDR_GSO_TCPV4;
                break;
    
            default:
                return m;
            }
    
    
            if (m->M_dat.MH.MH_pkthdr.csum_flags & VIRTIO_NET_CSUM_OFFLOAD) {
    
    Dor Laor's avatar
    Dor Laor committed
                hdr->flags |= virtio_net_hdr::VIRTIO_NET_HDR_F_NEEDS_CSUM;
                hdr->csum_start = csum_start;
    
                hdr->csum_offset = m->M_dat.MH.MH_pkthdr.csum_data;
    
            if (m->M_dat.MH.MH_pkthdr.csum_flags & CSUM_TSO) {
    
    Dor Laor's avatar
    Dor Laor committed
                if (ip_proto != IPPROTO_TCP)
                    return m;
    
    
                if (m->m_hdr.mh_len < csum_start + (int)sizeof(struct tcphdr)) {
    
    Dor Laor's avatar
    Dor Laor committed
                    m = m_pullup(m, csum_start + sizeof(struct tcphdr));
                    if (m == nullptr)
                        return nullptr;
                }
    
                tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start);
                hdr->gso_type = gso_type;
                hdr->hdr_len = csum_start + (tcp->th_off << 2);
    
                hdr->gso_size = m->M_dat.MH.MH_pkthdr.tso_segsz;
    
    Dor Laor's avatar
    Dor Laor committed
    
                if (tcp->th_flags & TH_CWR) {
                    if (!_tso_ecn) {
                        virtio_w("TSO with ECN not supported by host\n");
                        m_freem(m);
                        return nullptr;
                    }
    
                    hdr->flags |= virtio_net_hdr::VIRTIO_NET_HDR_GSO_ECN;
                }
            }
    
            return m;
        }
    
    
            virtio_net_req * req;
            u32 len;
    
            vring* vq = _txq.vqueue;
    
            req = static_cast<virtio_net_req*>(vq->get_buf_elem(&len));
    
            while(req != nullptr) {
    
                delete req;
    
                vq->get_buf_finalize();
    
                req = static_cast<virtio_net_req*>(vq->get_buf_elem(&len));
    
            vq->get_buf_gc();
    
        /**
         * Feature bits this driver offers during negotiation.
         *
         * @return the base virtio_driver feature mask extended with the
         *         virtio-net bits listed below and
         *         VIRTIO_RING_F_INDIRECT_DESC
         */
        u32 virtio_net::get_driver_features(void)
        {
            u32 base = virtio_driver::get_driver_features();

            return (base | (1 << VIRTIO_NET_F_MAC)
                         | (1 << VIRTIO_NET_F_MRG_RXBUF)
                         | (1 << VIRTIO_NET_F_STATUS)
                         | (1 << VIRTIO_NET_F_CSUM)
                         | (1 << VIRTIO_NET_F_GUEST_CSUM)
                         | (1 << VIRTIO_NET_F_GUEST_TSO4)
                         | (1 << VIRTIO_NET_F_HOST_ECN)
                         | (1 << VIRTIO_NET_F_HOST_TSO4)
                         | (1 << VIRTIO_NET_F_GUEST_ECN)
                         | (1 << VIRTIO_RING_F_INDIRECT_DESC));
        }
    
        hw_driver* virtio_net::probe(hw_device* dev)
        {
    
            return virtio::probe<virtio_net, VIRTIO_NET_DEVICE_ID>(dev);
    
    Dor Laor's avatar
    Dor Laor committed