接前一篇文章:
从 QEMU源码全解析 —— virtio(3) 到上一回 QEMU源码全解析 —— virtio(16) 花了十几个回目讲解了 virtio 设备的初始化,本回开始讲解virtio驱动的加载。
virtio驱动的加载
前文书讲了每一个virtio设备都有一个对应的virtio PCI代理设备,本回开始分析虚拟机内部的操作系统是如何加载virtio PCI代理设备驱动和virtio设备驱动、以及如何与virtio设备通信的。由于virtio PCI代理设备的存在,虚拟机内核在进行PCI总线扫描的时候会扫描到这个设备,并且会调用相应驱动的probe函数。
virtio_pci_driver及其probe回调函数均在Linux内核源码/drivers/virtio/virtio_pci_common.c中,如下所示:
- virtio_pci_driver
- static struct pci_driver virtio_pci_driver = { /* PCI driver descriptor for the virtio PCI proxy device */
- .name = "virtio-pci",
- .id_table = virtio_pci_id_table, /* device-ID match table; defined outside this excerpt */
- .probe = virtio_pci_probe, /* probe callback, called for a matching device found by the PCI scan */
- .remove = virtio_pci_remove, /* NOTE(review): presumably the teardown counterpart of probe -- not shown in this excerpt */
- #ifdef CONFIG_PM_SLEEP
- .driver.pm = &virtio_pci_pm_ops, /* only set when CONFIG_PM_SLEEP is defined */
- #endif
- .sriov_configure = virtio_pci_sriov_configure,
- };
- /* NOTE(review): module_pci_driver() presumably generates the module init/exit code that registers this driver -- confirm in include/linux/pci.h */
- module_pci_driver(virtio_pci_driver);
- virtio_pci_driver的probe函数virtio_pci_probe
- static int virtio_pci_probe(struct pci_dev *pci_dev,
- const struct pci_device_id *id)
- {
- struct virtio_pci_device *vp_dev, *reg_dev = NULL; /* reg_dev stays NULL until register_virtio_device() has been called */
- int rc;
- /*
-  * Probe callback of virtio_pci_driver.
-  *
-  * Allocates a virtio_pci_device for pci_dev, enables the PCI device,
-  * probes the modern and/or legacy virtio transport, enables bus
-  * mastering and finally registers the embedded virtio_device.
-  *
-  * pci_dev: the PCI device being bound.
-  * id:      matched ID-table entry; not used in this function body.
-  *
-  * Returns 0 on success, -ENOMEM if the allocation fails, otherwise the
-  * error code of the step that failed (after unwinding earlier steps).
-  */
- /* allocate our structure and fill it out */
- vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
- if (!vp_dev)
- return -ENOMEM;
- 
- pci_set_drvdata(pci_dev, vp_dev); /* make vp_dev reachable from the PCI device as its driver data */
- vp_dev->vdev.dev.parent = &pci_dev->dev; /* the virtio device becomes a child of the PCI device */
- vp_dev->vdev.dev.release = virtio_pci_release_dev; /* NOTE(review): presumably frees vp_dev on the last put_device() -- confirm */
- vp_dev->pci_dev = pci_dev;
- INIT_LIST_HEAD(&vp_dev->virtqueues);
- spin_lock_init(&vp_dev->lock);
- 
- /* enable the device */
- rc = pci_enable_device(pci_dev);
- if (rc)
- goto err_enable_device;
- 
- if (force_legacy) { /* legacy transport first, modern only as a fallback */
- rc = virtio_pci_legacy_probe(vp_dev);
- /* Also try modern mode if we can't map BAR0 (no IO space). */
- if (rc == -ENODEV || rc == -ENOMEM)
- rc = virtio_pci_modern_probe(vp_dev);
- if (rc)
- goto err_probe;
- } else { /* default order: modern transport first, legacy only if modern returns -ENODEV */
- rc = virtio_pci_modern_probe(vp_dev);
- if (rc == -ENODEV)
- rc = virtio_pci_legacy_probe(vp_dev);
- if (rc)
- goto err_probe;
- }
- 
- pci_set_master(pci_dev);
- 
- vp_dev->is_legacy = vp_dev->ldev.ioaddr ? true : false; /* true iff ldev.ioaddr is non-NULL */
- 
- rc = register_virtio_device(&vp_dev->vdev);
- reg_dev = vp_dev; /* set even when rc != 0, so the error path below uses put_device() instead of kfree() */
- if (rc)
- goto err_register;
- 
- return 0;
- /* Error unwinding: each label undoes the steps that succeeded before the failure, in reverse order. */
- err_register:
- if (vp_dev->is_legacy) /* undo whichever transport probe was used */
- virtio_pci_legacy_remove(vp_dev);
- else
- virtio_pci_modern_remove(vp_dev);
- err_probe:
- pci_disable_device(pci_dev);
- err_enable_device:
- if (reg_dev) /* registration was attempted: drop the device reference rather than freeing directly */
- put_device(&vp_dev->vdev.dev);
- else
- kfree(vp_dev);
- return rc;
- }
(1)virtio_pci_probe函数分配一个virtio_pci_device结构体实例并赋值给vp_dev,用来表示一个virtio PCI代理设备。代码片段如下:
- struct virtio_pci_device *vp_dev, *reg_dev = NULL;
- ……
- /* allocate our structure and fill it out */
- vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
- if (!vp_dev)
- return -ENOMEM;
struct virtio_pci_device的定义在Linux内核源码/drivers/virtio/virtio_pci_common.h中,代码如下:
- /*
-  * Our device structure.
-  *
-  * One instance is allocated per bound PCI device by virtio_pci_probe()
-  * and stored as that PCI device's driver data.
-  */
- struct virtio_pci_device {
- struct virtio_device vdev; /* embedded virtio device; this is what probe passes to register_virtio_device() */
- struct pci_dev *pci_dev; /* back-pointer to the underlying PCI device, set in probe */
- struct virtio_pci_legacy_device ldev; /* legacy transport state; probe tests ldev.ioaddr to derive is_legacy */
- struct virtio_pci_modern_device mdev; /* NOTE(review): presumably the modern transport state -- definition not shown here */
- 
- bool is_legacy; /* set in probe: true iff ldev.ioaddr is non-NULL */
- 
- /* Where to read and clear interrupt */
- u8 __iomem *isr;
- 
- /* a list of queues so we can dispatch IRQs */
- spinlock_t lock; /* initialised in probe with spin_lock_init() */
- struct list_head virtqueues; /* initialised in probe with INIT_LIST_HEAD() */
- 
- /* array of all queues for house-keeping */
- struct virtio_pci_vq_info **vqs;
- 
- /* MSI-X support */
- int msix_enabled;
- int intx_enabled;
- cpumask_var_t *msix_affinity_masks;
- /* Name strings for interrupts. This size should be enough,
- * and I'm too lazy to allocate each name separately. */
- char (*msix_names)[256];
- /* Number of available vectors */
- unsigned int msix_vectors;
- /* Vectors allocated, excluding per-vq vectors if any */
- unsigned int msix_used_vectors;
- 
- /* Whether we have vector per vq */
- bool per_vq_vectors;
- /* Transport-specific callbacks. NOTE(review): presumably filled in by the legacy/modern probe code -- not visible in this excerpt. */
- struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
- struct virtio_pci_vq_info *info,
- unsigned int idx,
- void (*callback)(struct virtqueue *vq),
- const char *name,
- bool ctx,
- u16 msix_vec);
- void (*del_vq)(struct virtio_pci_vq_info *info);
- 
- u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
- };
(2)将vp_dev设置为该pci_dev的私有数据(driver data)。代码片段如下:
pci_set_drvdata(pci_dev, vp_dev);
pci_set_drvdata函数在Linux内核源码/include/linux/pci.h中,代码如下:
- static inline void pci_set_drvdata(struct pci_dev *pdev, void *data)
- { /* Attach the driver-private pointer `data` to the PCI device `pdev`. */
- dev_set_drvdata(&pdev->dev, data); /* delegates to the struct device embedded in pdev */
- }
dev_set_drvdata函数在Linux内核源码/include/linux/device.h中,代码如下:
- static inline void dev_set_drvdata(struct device *dev, void *data)
- { /* Record the driver-private pointer `data` on the device `dev`. */
- dev->driver_data = data; /* plain pointer store; no copy or reference counting happens here */
- }
综上,
pci_set_drvdata(pci_dev, vp_dev);
展开后实际上是:
(&pci_dev->dev)->driver_data = vp_dev;
struct pci_dev的定义在Linux内核源码/include/linux/pci.h中,如下:
- /*
-  * The pci_dev structure describes PCI devices.
-  *
-  * Several members exist only when the CONFIG_* option guarding them
-  * below is enabled. The embedded `dev` member is the struct device
-  * that pci_set_drvdata() writes the driver data through.
-  */
- struct pci_dev {
- struct list_head bus_list; /* Node in per-bus list */
- struct pci_bus *bus; /* Bus this device is on */
- struct pci_bus *subordinate; /* Bus this device bridges to */
- 
- void *sysdata; /* Hook for sys-specific extension */
- struct proc_dir_entry *procent; /* Device entry in /proc/bus/pci */
- struct pci_slot *slot; /* Physical slot this device is in */
- 
- unsigned int devfn; /* Encoded device & function index */
- unsigned short vendor;
- unsigned short device;
- unsigned short subsystem_vendor;
- unsigned short subsystem_device;
- unsigned int class; /* 3 bytes: (base,sub,prog-if) */
- u8 revision; /* PCI revision, low byte of class word */
- u8 hdr_type; /* PCI header type (`multi' flag masked out) */
- #ifdef CONFIG_PCIEAER
- u16 aer_cap; /* AER capability offset */
- struct aer_stats *aer_stats; /* AER stats for this device */
- #endif
- #ifdef CONFIG_PCIEPORTBUS
- struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
- struct pci_dev *rcec; /* Associated RCEC device */
- #endif
- u32 devcap; /* PCIe Device Capabilities */
- u8 pcie_cap; /* PCIe capability offset */
- u8 msi_cap; /* MSI capability offset */
- u8 msix_cap; /* MSI-X capability offset */
- u8 pcie_mpss:3; /* PCIe Max Payload Size Supported */
- u8 rom_base_reg; /* Config register controlling ROM */
- u8 pin; /* Interrupt pin this device uses */
- u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */
- unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */
- 
- struct pci_driver *driver; /* Driver bound to this device */
- u64 dma_mask; /* Mask of the bits of bus address this
- device implements. Normally this is
- 0xffffffff. You only need to change
- this if your device has broken DMA
- or supports 64-bit transfers. */
- 
- struct device_dma_parameters dma_parms;
- 
- pci_power_t current_state; /* Current operating state. In ACPI,
- this is D0-D3, D0 being fully
- functional, and D3 being off. */
- unsigned int imm_ready:1; /* Supports Immediate Readiness */
- u8 pm_cap; /* PM capability offset */
- unsigned int pme_support:5; /* Bitmask of states from which PME#
- can be generated */
- unsigned int pme_poll:1; /* Poll device's PME status bit */
- unsigned int d1_support:1; /* Low power state D1 is supported */
- unsigned int d2_support:1; /* Low power state D2 is supported */
- unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
- unsigned int no_d3cold:1; /* D3cold is forbidden */
- unsigned int bridge_d3:1; /* Allow D3 for bridge */
- unsigned int d3cold_allowed:1; /* D3cold is allowed by user */
- unsigned int mmio_always_on:1; /* Disallow turning off io/mem
- decoding during BAR sizing */
- unsigned int wakeup_prepared:1;
- unsigned int skip_bus_pm:1; /* Internal: Skip bus-level PM */
- unsigned int ignore_hotplug:1; /* Ignore hotplug events */
- unsigned int hotplug_user_indicators:1; /* SlotCtl indicators
- controlled exclusively by
- user sysfs */
- unsigned int clear_retrain_link:1; /* Need to clear Retrain Link
- bit manually */
- unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */
- unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */
- 
- #ifdef CONFIG_PCIEASPM
- struct pcie_link_state *link_state; /* ASPM link state */
- unsigned int ltr_path:1; /* Latency Tolerance Reporting
- supported from root to here */
- u16 l1ss; /* L1SS Capability pointer */
- #endif
- unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */
- unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */
- 
- pci_channel_state_t error_state; /* Current connectivity state */
- struct device dev; /* Generic device interface; pci_set_drvdata() stores the driver data on this member */
- 
- int cfg_size; /* Size of config space */
- 
- /*
- * Instead of touching interrupt line and base address registers
- * directly, use the values stored here. They might be different!
- */
- unsigned int irq;
- struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
- 
- bool match_driver; /* Skip attaching driver */
- 
- unsigned int transparent:1; /* Subtractive decode bridge */
- unsigned int io_window:1; /* Bridge has I/O window */
- unsigned int pref_window:1; /* Bridge has pref mem window */
- unsigned int pref_64_window:1; /* Pref mem window is 64-bit */
- unsigned int multifunction:1; /* Multi-function device */
- 
- unsigned int is_busmaster:1; /* Is busmaster */
- unsigned int no_msi:1; /* May not use MSI */
- unsigned int no_64bit_msi:1; /* May only use 32-bit MSIs */
- unsigned int block_cfg_access:1; /* Config space access blocked */
- unsigned int broken_parity_status:1; /* Generates false positive parity */
- unsigned int irq_reroute_variant:2; /* Needs IRQ rerouting variant */
- unsigned int msi_enabled:1;
- unsigned int msix_enabled:1;
- unsigned int ari_enabled:1; /* ARI forwarding */
- unsigned int ats_enabled:1; /* Address Translation Svc */
- unsigned int pasid_enabled:1; /* Process Address Space ID */
- unsigned int pri_enabled:1; /* Page Request Interface */
- unsigned int is_managed:1; /* Managed via devres */
- unsigned int is_msi_managed:1; /* MSI release via devres installed */
- unsigned int needs_freset:1; /* Requires fundamental reset */
- unsigned int state_saved:1;
- unsigned int is_physfn:1;
- unsigned int is_virtfn:1;
- unsigned int is_hotplug_bridge:1;
- unsigned int shpc_managed:1; /* SHPC owned by shpchp */
- unsigned int is_thunderbolt:1; /* Thunderbolt controller */
- /*
- * Devices marked being untrusted are the ones that can potentially
- * execute DMA attacks and similar. They are typically connected
- * through external ports such as Thunderbolt but not limited to
- * that. When an IOMMU is enabled they should be getting full
- * mappings to make sure they cannot access arbitrary memory.
- */
- unsigned int untrusted:1;
- /*
- * Info from the platform, e.g., ACPI or device tree, may mark a
- * device as "external-facing". An external-facing device is
- * itself internal but devices downstream from it are external.
- */
- unsigned int external_facing:1;
- unsigned int broken_intx_masking:1; /* INTx masking can't be used */
- unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */
- unsigned int irq_managed:1;
- unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */
- unsigned int is_probed:1; /* Device probing in progress */
- unsigned int link_active_reporting:1;/* Device capable of reporting link active */
- unsigned int no_vf_scan:1; /* Don't scan for VFs after IOV enablement */
- unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */
- unsigned int rom_bar_overlap:1; /* ROM BAR disable broken */
- pci_dev_flags_t dev_flags;
- atomic_t enable_cnt; /* pci_enable_device has been called */
- 
- u32 saved_config_space[16]; /* Config space saved at suspend time */
- struct hlist_head saved_cap_space;
- int rom_attr_enabled; /* Display of ROM attribute enabled? */
- struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
- struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
- 
- #ifdef CONFIG_HOTPLUG_PCI_PCIE
- unsigned int broken_cmd_compl:1; /* No compl for some cmds */
- #endif
- #ifdef CONFIG_PCIE_PTM
- u16 ptm_cap; /* PTM Capability */
- unsigned int ptm_root:1;
- unsigned int ptm_enabled:1;
- u8 ptm_granularity;
- #endif
- #ifdef CONFIG_PCI_MSI
- void __iomem *msix_base;
- raw_spinlock_t msi_lock;
- #endif
- struct pci_vpd vpd;
- #ifdef CONFIG_PCIE_DPC
- u16 dpc_cap;
- unsigned int dpc_rp_extensions:1;
- u8 dpc_rp_log_size;
- #endif
- #ifdef CONFIG_PCI_ATS
- union {
- struct pci_sriov *sriov; /* PF: SR-IOV info */
- struct pci_dev *physfn; /* VF: related PF */
- };
- u16 ats_cap; /* ATS Capability offset */
- u8 ats_stu; /* ATS Smallest Translation Unit */
- #endif
- #ifdef CONFIG_PCI_PRI
- u16 pri_cap; /* PRI Capability offset */
- u32 pri_reqs_alloc; /* Number of PRI requests allocated */
- unsigned int pasid_required:1; /* PRG Response PASID Required */
- #endif
- #ifdef CONFIG_PCI_PASID
- u16 pasid_cap; /* PASID Capability offset */
- u16 pasid_features;
- #endif
- #ifdef CONFIG_PCI_P2PDMA
- struct pci_p2pdma __rcu *p2pdma;
- #endif
- u16 acs_cap; /* ACS Capability offset */
- phys_addr_t rom; /* Physical address if not from BAR */
- size_t romlen; /* Length if not from BAR */
- /*
- * Driver name to force a match. Do not set directly, because core
- * frees it. Use driver_set_override() to set or clear it.
- */
- const char *driver_override;
- 
- unsigned long priv_flags; /* Private flags for the PCI driver */
- 
- /* These methods index pci_reset_fn_methods[] */
- u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
- };
(3)初始化vp_dev(即virtio_pci_device结构)中的相关成员:设置virtio_device类型的vdev成员的父设备和release回调函数,记录pci_dev指针,并初始化virtqueues链表和自旋锁lock。代码片段如下:
- vp_dev->vdev.dev.parent = &pci_dev->dev;
- vp_dev->vdev.dev.release = virtio_pci_release_dev;
- vp_dev->pci_dev = pci_dev;
- INIT_LIST_HEAD(&vp_dev->virtqueues);
- spin_lock_init(&vp_dev->lock);
virtio_pci_probe函数其余部分代码的解析,请看下回。