背景
最近在排查一个网络问题,ifconfig eth0 up 后,网卡link up比较慢。因此,分析了下从ifconfig up 到网络驱动的调用流程。这里顺便作个记录。
ifconfig eth0 up 调用的是busybox 的命令,因此从busybox 源码入手,逐步分析下调用流程。相关代码位于:networking/ifenslave.c
ifconfig eth0 up
ifconfig eth0 up 和 ifconfig eth0 down 分别对应busybox 的set_if_up()和set_if_down().
/*
 * Bring an interface down.
 *
 * Clears IFF_UP from the supplied flag word and hands the result to
 * set_if_flags(). On a non-zero result an error message naming the
 * interface is printed. Returns whatever set_if_flags() returned.
 */
static int set_if_down(char *ifname, int flags)
{
	int status;

	status = set_if_flags(ifname, flags & ~IFF_UP);
	if (status != 0)
		bb_perror_msg("%s: can't down", ifname);

	return status;
}
/*
 * Bring an interface up.
 *
 * Sets IFF_UP in the supplied flag word and hands the result to
 * set_if_flags(). On a non-zero result an error message naming the
 * interface is printed. Returns whatever set_if_flags() returned.
 */
static int set_if_up(char *ifname, int flags)
{
	int status;

	status = set_if_flags(ifname, flags | IFF_UP);
	if (status != 0)
		bb_perror_msg("%s: can't up", ifname);

	return status;
}
比如,当我们敲ifconfig eth0 down时,实则就是调用:
set_if_down("eth0", master_flags.ifr_flags);
set_if_flags()会将网卡名,up / down 标志位flags通过ioctl命令SIOCSIFFLAGS 传递给内核网卡驱动。
static int set_if_flags(char *ifname, int flags) { struct ifreq ifr; ifr.ifr_flags = flags; return set_ifrname_and_do_ioctl(SIOCSIFFLAGS, &ifr, ifname); }
dev_ifsioc
接着深入到内核代码中,看下SIOCSIFFLAGS命令在哪里实现。代码位于kernel/net/core/dev_ioctl.c。
/*
 * Handle a per-device interface ioctl.
 *
 * Looks the device up by ifr->ifr_name within the given net namespace
 * and dispatches on cmd. Returns -ENODEV when no such device exists.
 * Only the SIOCSIFFLAGS and SIOCSIFMETRIC cases are shown; the rest of
 * the switch is elided in this excerpt (the dotted line below).
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
	case SIOCSIFFLAGS:	/* Set interface flags */
		return dev_change_flags(dev, ifr->ifr_flags);

	case SIOCSIFMETRIC:	/* Set the metric on the interface
				   (currently unused) */
		return -EOPNOTSUPP;

	...................
	}

	return err;
}
dev_ifsioc()会调用__dev_get_by_name()根据 网卡名遍历 net链表,如果匹配到则返回net_device结构体指针。接着,SIOCSIFFLAGS会调用到dev_change_flags(),最后调用到__dev_change_flags()。
dev_change_flags
/*
 * Change the flags of a device and notify about what changed.
 *
 * Snapshots dev->flags and dev->gflags, applies the request through
 * __dev_change_flags(), and on success passes the set of bits that
 * differ (in either word) to __dev_notify_flags().
 *
 * Returns the result of __dev_change_flags(); a negative result is
 * returned immediately without sending a notification.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	const unsigned int prev_flags = dev->flags;
	const unsigned int prev_gflags = dev->gflags;
	unsigned int changes;
	int ret;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = (prev_flags ^ dev->flags) | (prev_gflags ^ dev->gflags);
	__dev_notify_flags(dev, prev_flags, changes);

	return ret;
}
int __dev_change_flags(struct net_device *dev, unsigned int flags) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; /* 两个标识有一个是IFF_UP */ if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); // 通过flags 判断调用__dev_close 还是 __dev_open if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI is important. Some (broken) drivers set IFF_PROMISC, when IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; }
在__dev_change_flags(dev, flags)函数中,通过比较旧flags与新flags的IFF_UP位是否不同(即 (old_flags ^ flags) & IFF_UP),来判断是否需要开关eth0:若原先已置位IFF_UP则调用__dev_close(),否则调用__dev_open()。
__dev_close
__dev_close中会将当前的net_device加入到等待设备关闭列表中。
static int __dev_close(struct net_device *dev) { int retval; LIST_HEAD(single); list_add(&dev->close_list, &single); retval = __dev_close_many(&single); list_del(&single); return retval; }
__dev_close_many
__dev_close_many通知设备正在关闭,等待未发送完的数据发送完,最后清除开启标记。
/*
 * Close every device linked on head through its close_list node.
 *
 * Works in two passes: the first announces NETDEV_GOING_DOWN and clears
 * __LINK_STATE_START on each device; after dev_deactivate_many() the
 * second pass calls the driver's ndo_stop (if any) and clears IFF_UP.
 * Must be called with RTNL held and may sleep. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		/* Tell the notifier chain that the device is going down */
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		/* Clear the "started" state bit */
		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	/*
	 * Deactivate all devices on the list in one go.
	 * NOTE(review): the article describes this as letting pending
	 * transmissions drain - not verifiable from this excerpt.
	 */
	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		/* Invoke the driver's stop hook, if it has one */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		/* Mark the device as down */
		dev->flags &= ~IFF_UP;
		/* Re-enable netpoll */
		netpoll_poll_enable(dev);
	}

	return 0;
}
ndo_stop
ndo_stop为关闭网卡时,不同网卡驱动注册的不同的关闭函数,我们以海思的网卡驱动为例,分析下ndo_stop函数的实现。代码位于kernel/drivers/net/ethernet/hisilicon/hns/hns_enet.c。
hns_nic_net_stop
/*
 * Stop hook for the hns driver: delegates to hns_nic_net_down() and
 * always reports success (returns 0).
 */
static int hns_nic_net_stop(struct net_device *ndev)
{
	hns_nic_net_down(ndev);

	return 0;
}
hns_nic_net_down
/*
 * Take the hns interface down.
 *
 * Does nothing if NIC_STATE_DOWN was already set. Otherwise it stops
 * the service timer, stops TX queues, reports carrier off, stops the
 * PHY (if present), calls the AE "stop" op (if present) and finally
 * closes every ring pair and clears its TX buffers.
 */
static void hns_nic_net_down(struct net_device *ndev)
{
	int i;
	struct hnae_ae_ops *ops;
	struct hns_nic_priv *priv = netdev_priv(ndev);

	/* Already down: test_and_set_bit returns the previous bit value */
	if (test_and_set_bit(NIC_STATE_DOWN, &priv->state))
		return;

	(void)del_timer_sync(&priv->service_timer);
	netif_tx_stop_all_queues(ndev);
	netif_carrier_off(ndev);
	netif_tx_disable(ndev);
	priv->link = 0;

	if (priv->phy)
		phy_stop(priv->phy);

	ops = priv->ae_handle->dev->ops;

	if (ops->stop)
		ops->stop(priv->ae_handle);

	netif_tx_stop_all_queues(ndev);

	/*
	 * Walk the queues from last to first; ring i and ring i + q_num
	 * are closed together.
	 * NOTE(review): presumably these are the TX/RX halves of queue i -
	 * confirm against hns_nic_ring_close().
	 */
	for (i = priv->ae_handle->q_num - 1; i >= 0; i--) {
		hns_nic_ring_close(ndev, i);
		hns_nic_ring_close(ndev, i + priv->ae_handle->q_num);

		/* clean tx buffers*/
		hns_nic_tx_clr_all_bufs(priv->ring_data + i);
	}
}
hns_nic_net_down()中会调用netif_carrier_off()通知内核子系统网络断开。下面我们详细分析下netif_carrier_off()的实现。
netif_carrier_off()
/*
 * Record that the device has lost carrier.
 *
 * Sets __LINK_STATE_NOCARRIER. If the bit was already set, or the
 * device's reg_state is still NETREG_UNINITIALIZED, nothing further
 * happens. Otherwise the carrier_changes counter is incremented and a
 * linkwatch event is fired for the device.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Bit was already set: carrier was already reported off */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	/* Count the carrier transition */
	atomic_inc(&dev->carrier_changes);

	/* Queue the device for link-state processing */
	linkwatch_fire_event(dev);
}
linkwatch_fire_event()
linkwatch_fire_event()函数将设备加入到事件队列,并且进行事件调度,调度中会根据是否为紧急事件做不同处理。
/*
 * Queue a link-state event for a device and schedule its processing.
 *
 * The urgency of the event is evaluated first. If the device did not
 * already have __LINK_STATE_LINKWATCH_PENDING set, it is added to the
 * event list. If it was already pending and the event is not urgent,
 * there is nothing more to do. In all other cases the linkwatch work
 * is (re)scheduled with the computed urgency.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	bool urgent = linkwatch_urgent_event(dev);
	bool already_pending;

	already_pending = test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING,
					   &dev->state);

	if (!already_pending) {
		/* First event for this device: put it on the event list */
		linkwatch_add_event(dev);
	} else if (!urgent) {
		/* Already queued and not urgent: existing schedule suffices */
		return;
	}

	linkwatch_schedule_work(urgent);
}
linkwatch_urgent_event()
linkwatch_urgent_event()判断是否需要紧急处理。
/*
 * Decide whether a link event for this device must be handled urgently.
 *
 * Returns false when the device is not running. Returns true when the
 * device's ifindex differs from dev_get_iflink(), or when it carries
 * IFF_TEAM_PORT in priv_flags. Otherwise the event is urgent only if
 * the carrier is OK and qdisc_tx_changing() reports true.
 */
static bool linkwatch_urgent_event(struct net_device *dev)
{
	if (!netif_running(dev))
		return false;

	if (dev->ifindex != dev_get_iflink(dev) ||
	    (dev->priv_flags & IFF_TEAM_PORT))
		return true;

	if (!netif_carrier_ok(dev))
		return false;

	return qdisc_tx_changing(dev);
}
linkwatch_add_event()
linkwatch_add_event()将设备加入到事件处理链表。
/*
 * Put a device on the global linkwatch event list.
 *
 * Under lweventlist_lock (IRQ-saving variant): if the device's
 * link_watch_list node is not yet linked anywhere, append it to
 * lweventlist and call dev_hold() on the device.
 */
static void linkwatch_add_event(struct net_device *dev)
{
	unsigned long irq_state;

	spin_lock_irqsave(&lweventlist_lock, irq_state);

	/* Only enqueue once; an already-linked node is left alone */
	if (list_empty(&dev->link_watch_list)) {
		list_add_tail(&dev->link_watch_list, &lweventlist);
		dev_hold(dev);
	}

	spin_unlock_irqrestore(&lweventlist_lock, irq_state);
}
linkwatch_schedule_work()
linkwatch_schedule_work()对事件处理进行调度,紧急事件立即执行,非紧急事件延后执行。
/*
 * Schedule the linkwatch work item.
 *
 * @urgent: non-zero to request immediate execution.
 *
 * The default delay is the distance from now (jiffies) to
 * linkwatch_nextevent. An urgent request sets LW_URGENT and runs the
 * work with zero delay; a non-urgent request uses
 * schedule_delayed_work() with the computed delay.
 */
static void linkwatch_schedule_work(int urgent)
{
	unsigned long delay = linkwatch_nextevent - jiffies;

	/* An urgent run is already flagged: nothing to add */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		return;

	/* Urgent request */
	if (urgent) {
		/* Someone else set the flag in the meantime: they schedule */
		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
			return;
		/* We set the flag: run without delay */
		delay = 0;
	}

	/*
	 * delay is unsigned, so a linkwatch_nextevent that is already in
	 * the past wraps to a huge value; anything above HZ is treated as
	 * "run now".
	 */
	if (delay > HZ)
		delay = 0;

	/* Urgent flag set: (re)arm the work to run immediately */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		mod_delayed_work(system_wq, &linkwatch_work, 0);
	else
		/* Not urgent: schedule with the computed delay */
		schedule_delayed_work(&linkwatch_work, delay);
}
__linkwatch_run_queue()
__linkwatch_run_queue()完成对事件调度队列中设备的处理。
/*
 * Process the devices queued on lweventlist.
 *
 * @urgent_only: non-zero to handle only devices for which
 *               linkwatch_urgent_event() is true; the others are put
 *               back on lweventlist and a non-urgent run is scheduled.
 *
 * The global list is spliced onto a private list under
 * lweventlist_lock; the lock is dropped around each
 * linkwatch_do_dev() call.
 */
static void __linkwatch_run_queue(int urgent_only)
{
	struct net_device *dev;
	LIST_HEAD(wrk);

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket.  This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	/* Full run: the next regular run is one second (HZ) from now */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	/*
	 * Urgent-only run: if the next regular run lies more than HZ in
	 * the future, pull it back to the current time.
	 */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;

	/* Clear the urgent flag */
	clear_bit(LW_URGENT, &linkwatch_flags);

	spin_lock_irq(&lweventlist_lock);
	list_splice_init(&lweventlist, &wrk);

	/* Drain the private list */
	while (!list_empty(&wrk)) {

		/* Take the first queued device */
		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
		/* Unlink it from the private list */
		list_del_init(&dev->link_watch_list);

		/* Urgent-only run and this device is not urgent */
		if (urgent_only && !linkwatch_urgent_event(dev)) {
			/* Put it back at the tail of the global list */
			list_add_tail(&dev->link_watch_list, &lweventlist);
			continue;
		}

		/* Handle the device with the list lock released */
		spin_unlock_irq(&lweventlist_lock);
		linkwatch_do_dev(dev);
		spin_lock_irq(&lweventlist_lock);
	}

	/* Events remain on the global list: schedule a non-urgent run */
	if (!list_empty(&lweventlist))
		linkwatch_schedule_work(0);
	spin_unlock_irq(&lweventlist_lock);
}
linkwatch_do_dev()
linkwatch_do_dev()完成对某个设备的状态改变处理。
/*
 * Handle a pending link-state change for one device.
 *
 * Clears the pending bit, applies rfc2863_policy(), and - only when
 * the device has IFF_UP set - activates or deactivates it depending on
 * the carrier state and then calls netdev_state_change(). Ends with
 * dev_put(dev).
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* We are about to handle this device,
	 * so new events can be accepted
	 */
	/* Clear the pending flag */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	rfc2863_policy(dev);
	/* Only act when the device is administratively up */
	if (dev->flags & IFF_UP) {
		/* Carrier present */
		if (netif_carrier_ok(dev))
			/* Activate the device's queueing */
			dev_activate(dev);
		else
			/* No carrier: deactivate the device's queueing */
			dev_deactivate(dev);

		/*
		 * Report the state change.
		 * NOTE(review): the article says this runs the netdev_chain
		 * notifier callbacks - not visible in this excerpt.
		 */
		netdev_state_change(dev);
	}
	dev_put(dev);
}
phy_stop()
最后,hns_nic_net_down()中会调用phy_stop()将网卡link down。
/*
 * Stop a PHY.
 *
 * Under phydev->lock: if the PHY is not already in PHY_HALTED, its
 * interrupts are disabled and cleared (when phy_interrupt_is_valid()
 * says they are in use) and the state is set to PHY_HALTED.
 */
void phy_stop(struct phy_device *phydev)
{
	mutex_lock(&phydev->lock);

	/* Already halted: nothing to do */
	if (PHY_HALTED == phydev->state)
		goto out_unlock;

	if (phy_interrupt_is_valid(phydev)) {
		/* Disable PHY Interrupts */
		phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);

		/* Clear any pending interrupts */
		phy_clear_interrupt(phydev);
	}

	phydev->state = PHY_HALTED;

out_unlock:
	mutex_unlock(&phydev->lock);

	/* Cannot call flush_scheduled_work() here as desired because
	 * of rtnl_lock(), but PHY_HALTED shall guarantee phy_change()
	 * will not reenable interrupts.
	 */
}
phy_stop()将phydev->state设置为PHY_HALTED,将网卡关闭。
__dev_open
__dev_open为设备启用核心函数,该函数打开eth0,设置启用标记,并且设置接收模式,排队规则等。
/*
 * Open a device (caller must hold RTNL).
 *
 * Sends NETDEV_PRE_UP, marks the device started, validates its address
 * and calls the driver's ndo_open. On success IFF_UP is set, the RX
 * mode is programmed and the device is activated; on failure the
 * started bit is cleared again.
 *
 * Returns 0 on success, -ENODEV if the device is not present, or the
 * error from the notifier / ndo_validate_addr / ndo_open.
 */
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/* Device is not present */
	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	/* Pre-open notification; a notifier may veto the open */
	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	/*
	 * NOTE(review): this early return does not call
	 * netpoll_poll_enable() as far as this excerpt shows - confirm
	 * against upstream.
	 */
	if (ret)
		return ret;

	/*
	 * Mark the device as started (__LINK_STATE_START). IFF_UP itself
	 * is only set further down, once the open has succeeded.
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	/* Validate the address if the driver provides a hook */
	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	/* Call the driver's open hook */
	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/* Re-enable netpoll */
	netpoll_poll_enable(dev);

	/* Failure: undo the started bit */
	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/* Success: mark the device up */
		dev->flags |= IFF_UP;
		/* Program the receive mode */
		dev_set_rx_mode(dev);
		/* Activate the device's queueing */
		dev_activate(dev);
		/* Feed the device address into the entropy pool */
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}
hns_nic_net_open()
我们以海思的网卡驱动为例,分析下ndo_open()函数的实现。代码位于kernel/drivers/net/ethernet/hisilicon/hns/hns_enet.c。
static int hns_nic_net_open(struct net_device *ndev) { struct hns_nic_priv *priv = netdev_priv(ndev); struct hnae_handle *h = priv->ae_handle; int ret; if (test_bit(NIC_STATE_TESTING, &priv->state)) return -EBUSY; priv->link = 0; netif_carrier_off(ndev); /*设置tx queue的个数*/ ret = netif_set_real_num_tx_queues(ndev, h->q_num); if (ret < 0) { netdev_err(ndev, "netif_set_real_num_tx_queues fail, ret=%d! ", ret); return ret; } /*设置rx queue的个数*/ ret = netif_set_real_num_rx_queues(ndev, h->q_num); if (ret < 0) { netdev_err(ndev, "netif_set_real_num_rx_queues fail, ret=%d! ", ret); return ret; } /*启动网卡*/ ret = hns_nic_net_up(ndev); if (ret) { netdev_err(ndev, "hns net up fail, ret=%d! ", ret); return ret; } return 0; }
hns_nic_net_up()
static int hns_nic_net_up(struct net_device *ndev) { struct hns_nic_priv *priv = netdev_priv(ndev); struct hnae_handle *h = priv->ae_handle; int i, j, k; int ret; /*初始化中断,并设置中断函数为hns_irq_handle,每个rx和tx queue都对应一个中断*/ ret = hns_nic_init_irq(priv); if (ret != 0) { netdev_err(ndev, "hns init irq failed! ret=%d ", ret); return ret; } for (i = 0; i < h->q_num * 2; i++) { /*使能中断,使能napi*/ ret = hns_nic_ring_open(ndev, i); if (ret) goto out_has_some_queues; } for (k = 0; k < h->q_num; k++) h->dev->ops->toggle_queue_status(h->qs[k], 1); /*设置mac地址*/ ret = h->dev->ops->set_mac_addr(h, ndev->dev_addr); if (ret) goto out_set_mac_addr_err; /*hns的start函数为null*/ ret = h->dev->ops->start ? h->dev->ops->start(h) : 0; if (ret) goto out_start_err; if (priv->phy) /*启动phy*/ phy_start(priv->phy); clear_bit(NIC_STATE_DOWN, &priv->state); /*修改time 每一秒到期一次*/ (void)mod_timer(&priv->service_timer, jiffies + SERVICE_TIMER_HZ); return 0; out_start_err: netif_stop_queue(ndev); out_set_mac_addr_err: for (k = 0; k < h->q_num; k++) h->dev->ops->toggle_queue_status(h->qs[k], 0); out_has_some_queues: for (j = i - 1; j >= 0; j--) hns_nic_ring_close(ndev, j); set_bit(NIC_STATE_DOWN, &priv->state); return ret; }
phy_start()
最后会调用到phy_start()启动网卡。
/*
 * Start (or restart) a PHY.
 *
 * Under phydev->lock the state is advanced:
 *   PHY_STARTING -> PHY_PENDING
 *   PHY_READY    -> PHY_UP
 *   PHY_HALTED   -> PHY_RESUMING (after re-enabling interrupts; the
 *                   state is left unchanged if that fails)
 * Any other state is left untouched. When coming from PHY_HALTED,
 * phy_resume() is called after the lock has been released.
 */
void phy_start(struct phy_device *phydev)
{
	bool do_resume = false;
	int err = 0;

	mutex_lock(&phydev->lock);

	switch (phydev->state) {
	case PHY_STARTING:
		phydev->state = PHY_PENDING;
		break;
	case PHY_READY:
		phydev->state = PHY_UP;
		break;
	case PHY_HALTED:
		/* make sure interrupts are re-enabled for the PHY */
		err = phy_enable_interrupts(phydev);
		if (err < 0)
			break;

		phydev->state = PHY_RESUMING;
		do_resume = true;
		break;
	default:
		break;
	}
	mutex_unlock(&phydev->lock);

	/* if phy was suspended, bring the physical link up again */
	if (do_resume)
		phy_resume(phydev);
}
审核编辑:刘清
全部0条评论
快来发表一下你的评论吧 !