前言
在 PCI 总线下,当 PCIe 设备和 PCIe 驱动匹配后,就会执行驱动的 probe() 函数来初始化设备,以让设备正常运行。
在 probe() 函数中,最先做的事情就是执行 pci_enable_device() 来使能设备。如果设备都无法使能的话,那就直接返回,接下来的工作也就不用做了。
代码解析
pci_enable_device()
的实现在内核 pci 子系统中,所在文件为 drivers/pci/pci.c
/**
 * pci_enable_device - prepare a PCI device for use by its driver
 * @dev: PCI device to be initialized
 *
 * Thin wrapper that requests enabling of both memory and I/O resources
 * by forwarding to pci_enable_device_flags().  The call can fail, so the
 * return value must be checked.
 *
 * Calling this repeatedly does not re-enable the hardware; subsequent
 * calls only bump the enable count.
 */
int pci_enable_device(struct pci_dev *dev)
{
	const unsigned long res_types = IORESOURCE_MEM | IORESOURCE_IO;

	return pci_enable_device_flags(dev, res_types);
}
在 pci_enable_device() 中调用 pci_enable_device_flags()
,pci_enable_device_flags() 实现如下
/*
 * pci_enable_device_flags - enable a device for the resource types in @flags
 * @dev: PCI device to enable
 * @flags: resource type bits; a resource is selected when its ->flags
 *         intersects this mask
 *
 * Refreshes dev->current_state from the PM control register (when the
 * device has a PM capability), then increments dev->enable_cnt.  Only the
 * caller that takes the count to 1 goes on to enable the upstream bridge,
 * build the resource bitmask and call do_pci_enable_device(); every later
 * caller returns 0 right after the increment.
 *
 * Returns 0 if the device was already enabled, otherwise the value
 * returned by do_pci_enable_device().  When that value is negative the
 * enable count is decremented again.
 */
static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
{
	struct pci_dev *bridge;
	int err;
	int i, bars = 0;

	/*
	 * Power state could be unknown at this point, either due to a fresh
	 * boot or a device removal call.  So get the current power state
	 * so that things like MSI message writing will behave as expected
	 * (e.g. if the device really is in D0 at enable time).
	 */
	if (dev->pm_cap) {
		u16 pmcsr;
		pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
		dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
	}

	/* The count is incremented even on the "already enabled" path. */
	if (atomic_inc_return(&dev->enable_cnt) > 1)
		return 0;		/* already enabled */

	/* Enable the upstream bridge (if any) before touching the device. */
	bridge = pci_upstream_bridge(dev);
	if (bridge)
		pci_enable_bridge(bridge);

	/*
	 * only skip sriov related
	 *
	 * Two index ranges are scanned: 0..PCI_ROM_RESOURCE and
	 * PCI_BRIDGE_RESOURCES..DEVICE_COUNT_RESOURCE-1.  Bit i of bars is
	 * set when resource i matches one of the requested types.
	 */
	for (i = 0; i <= PCI_ROM_RESOURCE; i++)
		if (dev->resource[i].flags & flags)
			bars |= (1 << i);
	for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++)
		if (dev->resource[i].flags & flags)
			bars |= (1 << i);

	err = do_pci_enable_device(dev, bars);
	/* Undo our increment so a later call retries the full enable. */
	if (err < 0)
		atomic_dec(&dev->enable_cnt);
	return err;
}
第 15 行调用 pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
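读取电源管理(Power Management)Capability 结构中的 PMCSR 寄存器。该结构的起始地址保存在 dev->pm_cap 中,它是在 Capability 链表里查到的偏移,不同设备可能不同(下文以 0x40 为例);PMCSR 位于结构内偏移 PCI_PM_CTRL(即 4)处,也就是 0x40 + 4。PMCSR 本身宽 16 位(2 字节),紧随其后的两个字节分别是 PMCSR_BSE 和 Data 寄存器,不属于 PMCSR。所以代码中使用 u16 类型的变量 pmcsr 来存储读到的值。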
在这里插一句:
PCI 设备使用的基本配置空间由 64 个字节组成,地址范围为 0x00 ~ 0x3F,这 64 个字节是所有 PCI 设备必须支持的。事实上,许多 PCI 设备也仅支持这 64 个配置寄存器。
此外,配置空间中还有 0x40 ~ 0xFF 这一段,主要存放一些与 MSI 中断机制和电源管理相关的 Capability 结构;PCIe 设备在此基础上又扩展了 0x100 ~ 0xFFF 这段扩展配置空间,用于存放 PCIe 的 Extended Capability 结构。
以下是《PCI EXPRESS 体系结构导读》中对电源管理寄存器的介绍
从规范中可以看到,PMCSR 的第 0~1 bits 为 Power State 字段,所以代码中该字段的掩码为 0x3。
pmcsr 与掩码按位与就得到了 Power State,将结果赋值给 dev->current_state。
#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */
dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
下面这两行代码印证了 pci_enable_device() 函数注释中的说明:we don't actually enable the device many times if we call this function repeatedly (we just increment the count)。也就是说,重复调用时只是把引用计数 enable_cnt 加 1,并不会重复使能设备。
if (atomic_inc_return(&dev->enable_cnt) > 1)return 0; /* already enabled */
接下来的三行代码,作用是,使能该 PCIe 设备的上游桥片,upstream 这个概念也是 PCI 规范中出现的。
使能上游桥片,想想也是理所当然,如果上游桥片不工作,那么处于该桥片下面的设备怎么可能正常工作呢。
bridge = pci_upstream_bridge(dev);if (bridge)pci_enable_bridge(bridge);
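接着计算资源掩码 bars:遍历设备的资源数组,凡是类型与 flags(这里是 IORESOURCE_MEM | IORESOURCE_IO)匹配的资源,就把掩码中对应的 bit 置 1。对于 PCIe 设备而言,最多有 6 个 BAR 空间,即 BAR0~BAR5,对应下标 0~5;第一个循环还包含了 PCI_ROM_RESOURCE(Expansion ROM),第二个循环处理从 PCI_BRIDGE_RESOURCES 开始的桥窗口资源,两者之间被跳过的是 SR-IOV 相关的资源。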
/* only skip sriov related */for (i = 0; i <= PCI_ROM_RESOURCE; i++)if (dev->resource[i].flags & flags)bars |= (1 << i);for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++)if (dev->resource[i].flags & flags)bars |= (1 << i);
接着就到了 do_pci_enable_device(dev, bars)
,看下 do_pci_enable_device() 的具体实现
static int do_pci_enable_device(struct pci_dev *dev, int bars)
{int err;struct pci_dev *bridge;u16 cmd;u8 pin;err = pci_set_power_state(dev, PCI_D0);if (err < 0 && err != -EIO)return err;bridge = pci_upstream_bridge(dev);if (bridge)pcie_aspm_powersave_config_link(bridge);err = pcibios_enable_device(dev, bars);if (err < 0)return err;pci_fixup_device(pci_fixup_enable, dev);if (dev->msi_enabled || dev->msix_enabled)return 0;pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);if (pin) {pci_read_config_word(dev, PCI_COMMAND, &cmd);if (cmd & PCI_COMMAND_INTX_DISABLE)pci_write_config_word(dev, PCI_COMMAND,cmd & ~PCI_COMMAND_INTX_DISABLE);}return 0;
}
第 8 行 pci_set_power_state(dev, PCI_D0)
将设备设置为 D0 状态,
pci_set_power_state() 具体实现如下
/**
 * pci_set_power_state - Set the power state of a PCI device
 * @dev: PCI device to handle.
 * @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
 *
 * Transition a device to a new power state, using the platform firmware and/or
 * the device's PCI PM registers.
 *
 * RETURN VALUE:
 * -EINVAL if the requested state is invalid.
 * -EIO if device does not support PCI PM or its PM capabilities register has a
 * wrong version, or device doesn't support the requested state.
 * 0 if the transition is to D1 or D2 but D1 and D2 are not supported.
 * 0 if device already is in the requested state.
 * 0 if the transition is to D3 but D3 is not supported.
 * 0 if device's power state has been successfully changed.
 */
int pci_set_power_state(struct pci_dev *dev, pci_power_t state)
{
	int error;

	/* bound the state we're entering */
	if (state > PCI_D3cold)
		state = PCI_D3cold;
	else if (state < PCI_D0)
		state = PCI_D0;
	else if ((state == PCI_D1 || state == PCI_D2) && pci_no_d1d2(dev))
		/*
		 * If the device or the parent bridge do not support PCI PM,
		 * ignore the request if we're doing anything other than putting
		 * it into D0 (which would only happen on boot).
		 */
		return 0;

	/* Check if we're already there */
	if (dev->current_state == state)
		return 0;

	__pci_start_power_transition(dev, state);

	/*
	 * This device is quirked not to be put into D3, so
	 * don't put it in D3
	 */
	if (state >= PCI_D3hot && (dev->dev_flags & PCI_DEV_FLAGS_NO_D3))
		return 0;

	/*
	 * To put device in D3cold, we put device into D3hot in native
	 * way, then put device into D3cold with platform ops
	 */
	error = pci_raw_set_power_state(dev, state > PCI_D3hot ?
					PCI_D3hot : state);

	/*
	 * A zero return from __pci_complete_power_transition() overrides
	 * whatever the raw register path reported above.
	 */
	if (!__pci_complete_power_transition(dev, state))
		error = 0;

	return error;
}
其中 D0、D1、D2、D3cold、D3hot
状态及其转换关系在 PCIe 规范中都有明确规定,
该段代码调用 pci_raw_set_power_state(dev, state > PCI_D3hot ? PCI_D3hot : state);
将 PCIe 设备设置为 D0 状态,pci_raw_set_power_state() 函数的实现如下,
/**
 * pci_raw_set_power_state - Use PCI PM registers to set the power state of
 *                           given PCI device
 * @dev: PCI device to handle.
 * @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
 *
 * RETURN VALUE:
 * -EINVAL if the requested state is invalid.
 * -EIO if device does not support PCI PM or its PM capabilities register has a
 * wrong version, or device doesn't support the requested state.
 * 0 if device already is in the requested state.
 * 0 if device's power state has been successfully changed.
 *
 * Note: once the control register has been written, 0 is returned even if
 * the state read back differs from @state; that case is only logged.
 */
static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state)
{
	u16 pmcsr;
	bool need_restore = false;

	/* Check if we're already there */
	if (dev->current_state == state)
		return 0;

	/* Without a PM capability there is no register to program. */
	if (!dev->pm_cap)
		return -EIO;

	/* Only D0..D3hot can be requested through this register path. */
	if (state < PCI_D0 || state > PCI_D3hot)
		return -EINVAL;

	/*
	 * Validate current state:
	 * Can enter D0 from any state, but if we can only go deeper
	 * to sleep if we're already in a low power state
	 */
	if (state != PCI_D0 && dev->current_state <= PCI_D3cold
	    && dev->current_state > state) {
		dev_err(&dev->dev, "invalid power transition (from state %d to %d)\n",
			dev->current_state, state);
		return -EINVAL;
	}

	/* check if this device supports the desired state */
	if ((state == PCI_D1 && !dev->d1_support)
	   || (state == PCI_D2 && !dev->d2_support))
		return -EIO;

	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);

	/*
	 * If we're (effectively) in D3, force entire word to 0.
	 * This doesn't affect PME_Status, disables PME_En, and
	 * sets PowerState to 0.
	 */
	switch (dev->current_state) {
	case PCI_D0:
	case PCI_D1:
	case PCI_D2:
		/* Replace only the state field, keep the other bits. */
		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
		pmcsr |= state;
		break;
	case PCI_D3hot:
	case PCI_D3cold:
	case PCI_UNKNOWN: /* Boot-up */
		/*
		 * Register says D3hot and the No_Soft_Reset bit is clear:
		 * remember to restore the BARs after the transition.
		 */
		if ((pmcsr & PCI_PM_CTRL_STATE_MASK) == PCI_D3hot
		 && !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET))
			need_restore = true;
		/* Fall-through: force to D0 */
	default:
		pmcsr = 0;
		break;
	}

	/* enter specified state */
	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);

	/* Mandatory power management transition delays */
	/* see PCI PM 1.1 5.6.1 table 18 */
	if (state == PCI_D3hot || dev->current_state == PCI_D3hot)
		pci_dev_d3_sleep(dev);
	else if (state == PCI_D2 || dev->current_state == PCI_D2)
		udelay(PCI_PM_D2_DELAY);

	/* Read back what the device actually reports and cache it. */
	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
	dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
	if (dev->current_state != state && printk_ratelimit())
		dev_info(&dev->dev, "Refused to change power state, currently in D%d\n",
			 dev->current_state);

	/*
	 * According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
	 * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning
	 * from D3hot to D0 _may_ perform an internal reset, thereby
	 * going to "D0 Uninitialized" rather than "D0 Initialized".
	 * For example, at least some versions of the 3c905B and the
	 * 3c556B exhibit this behaviour.
	 *
	 * At least some laptop BIOSen (e.g. the Thinkpad T21) leave
	 * devices in a D3hot state at boot.  Consequently, we need to
	 * restore at least the BARs so that the device will be
	 * accessible to its driver.
	 */
	if (need_restore)
		pci_restore_bars(dev);

	/* Notify ASPM via the bridge that owns this device's bus, if any. */
	if (dev->bus->self)
		pcie_aspm_pm_state_change(dev->bus->self);

	return 0;
}
第 45 行读取设备当前的 pmcsr 寄存器值,
第 55、56 行将该寄存器中的 state 字段设置为指定值(D0 = 0),
第 71 行将 pmcsr 写入设备的配置空间,
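至此完成了把设备切换到 D0 状态的工作。回到 do_pci_enable_device() 之后,还要由 pcibios_enable_device() 按照 bars 掩码使能设备的 I/O 和内存访问,才算真正完成 PCIe 设备的使能工作。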
总结
可以看到,驱动代码是 PCIe 规范的具体实现,
如果你没看过规范,那么当你看到如下宏定义时
/*
 * PCI device power states, expressed as pci_power_t values.
 *
 * PCI_D0..PCI_D3hot (0..3) are the values that pci_raw_set_power_state()
 * ORs directly into the state field of the PM control register
 * (PCI_PM_CTRL_STATE_MASK, 0x0003).  PCI_D3cold and above are rejected by
 * that function with -EINVAL, i.e. they are never written to the register.
 */
#define PCI_D0 ((pci_power_t __force) 0)
#define PCI_D1 ((pci_power_t __force) 1)
#define PCI_D2 ((pci_power_t __force) 2)
#define PCI_D3hot ((pci_power_t __force) 3)
#define PCI_D3cold ((pci_power_t __force) 4)
/* State not known, e.g. at boot-up. */
#define PCI_UNKNOWN ((pci_power_t __force) 5)
/* NOTE(review): presumably an error marker for pci_power_t results; confirm against users. */
#define PCI_POWER_ERROR ((pci_power_t __force) -1)
你又怎会明白这些 D0、D1、D2、D3hot、D3cold
是什么意思?
没看过规范,你又怎会明白 pmcsr
变量是什么意思?
没看过规范,你又怎会知道 Power State 的掩码要设置为 0x3
#define PCI_PM_CTRL_STATE_MASK 0x0003
甚至,驱动和内核代码中,常常可以见到直接以规范的章节号、表格号作为注释,如
/* see PCI PM 1.1 5.6.1 table 18 */
/* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT* INTERFACE SPECIFICATION, REV. 1.2" ... */
意思很明显,作者觉得三言两语也解释不清这段代码的作用,就直接贴出来了规范的章节号,请读者自己去看规范吧。
下面这个例子更具有说服力,
static void pci_flr_wait(struct pci_dev *dev)
{int delay = 1, timeout = 60000;u32 id;/** Per PCIe r3.1, sec 6.6.2, a device must complete an FLR within* 100ms, but may silently discard requests while the FLR is in* progress. Wait 100ms before trying to access the device.*/msleep(100);/** After 100ms, the device should not silently discard config* requests, but it may still indicate that it needs more time by* responding to them with CRS completions. The Root Port will* generally synthesize ~0 data to complete the read (except when* CRS SV is enabled and the read was for the Vendor ID; in that* case it synthesizes 0x0001 data).** Wait for the device to return a non-CRS completion. Read the* Command register instead of Vendor ID so we don't have to* contend with the CRS SV value.*/pci_read_config_dword(dev, PCI_COMMAND, &id);while (id == ~0) {if (delay > timeout) {dev_warn(&dev->dev, "not ready %dms after FLR; giving up\n",100 + delay - 1);return;}if (delay > 1000)dev_info(&dev->dev, "not ready %dms after FLR; waiting\n",100 + delay - 1);msleep(delay);delay *= 2;pci_read_config_dword(dev, PCI_COMMAND, &id);}if (delay > 1000)dev_info(&dev->dev, "ready %dms after FLR\n", 100 + delay - 1);
}
第 11 行,直接来了个 msleep(100);
,还好作者写了注释:根据 PCIe r3.1,第 6.6.2 节,设备必须在 100ms 内完成一个 FLR
。不过,即便有这个注释,你要想真正弄明白为什么要睡眠 100ms,还是要去阅读 PCIe 3.1 规范的 6.6.2 节才行。
所以,借用《Linux 那些事儿之我是 U 盘》中的一句话:
从协议中来,到协议中去。