PCPU路由缓存


路由查找

与IPv4不同,IPv6的出口路由和入口路由都使用函数ip6_pol_route实现,区别仅在于传入的接口索引参数不同:入口路由传入fl6->flowi6_iif,出口路由传入fl6->flowi6_oif。

/*
 * Input-side route lookup.
 *
 * Delegates to ip6_pol_route(), using the flow's input interface index
 * (fl6->flowi6_iif) as the interface argument.  All other arguments are
 * forwarded unchanged and the result of ip6_pol_route() is returned as is.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
                        struct fib6_table *table,
                        struct flowi6 *fl6,
                        const struct sk_buff *skb,
                        int flags)
{
    int iif = fl6->flowi6_iif;

    return ip6_pol_route(net, table, iif, fl6, skb, flags);
}
/*
 * Output-side route lookup.
 *
 * Delegates to ip6_pol_route(), using the flow's output interface index
 * (fl6->flowi6_oif) as the interface argument.  All other arguments are
 * forwarded unchanged and the result of ip6_pol_route() is returned as is.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
                         struct fib6_table *table,
                         struct flowi6 *fl6,
                         const struct sk_buff *skb,
                         int flags)
{
    int oif = fl6->flowi6_oif;

    return ip6_pol_route(net, table, oif, fl6, skb, flags);
}
/*
 * ip6_pol_route - select a route for a flow and return a usable cache entry.
 *
 * Lookup paths, per the original notes: the normal route-add command path
 * goes ip6_pol_route_lookup -> fib6_lookup -> fib6_lookup_1, while other
 * lookups go ip6_pol_route -> fib6_lookup -> fib6_lookup_1.
 *
 * @net:   network namespace (provides devconf_all and ip6_null_entry)
 * @table: fib6 table to search; its tb6_lock is read-locked during selection
 * @oif:   interface index passed to route selection; forced to 0 when the
 *         flow has FLOWI_FLAG_SKIP_NH_OIF set
 * @fl6:   flow description (daddr/saddr/flowi6_flags are read)
 * @flags: lookup flags; only RT6_LOOKUP_F_IFACE is carried into 'strict'
 *
 * Returns one of: the selected route itself (null entry or an RTF_CACHE
 * route), a freshly allocated uncached clone, or a per-CPU copy.  On every
 * path dst_use()/dst_hold() is called on the returned entry before returning
 * (dst_use() is assumed to take a reference -- TODO confirm).
 *
 * NOTE(review): the ip6_pol_route_input/output wrappers earlier in this file
 * call ip6_pol_route() with an extra skb argument, which this definition
 * does not accept; the snippets appear to come from different kernel
 * versions -- confirm which one is intended.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                      struct flowi6 *fl6, int flags)
{
    struct fib6_node *fn, *saved_fn;
    struct rt6_info *rt;
    int strict = 0;

    strict |= flags & RT6_LOOKUP_F_IFACE;
    /* With forwarding disabled, first try reachable routes only. */
    if (net->ipv6.devconf_all->forwarding == 0)
        strict |= RT6_LOOKUP_F_REACHABLE;

    read_lock_bh(&table->tb6_lock);
/* Find the fib6 node matching the flow's destination/source address. */
    fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
    /* Remembered so the search can restart from here with relaxed 'strict'. */
    saved_fn = fn;

    if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
        oif = 0;

redo_rt6_select:
    rt = rt6_select(fn, oif, strict);
    if (rt->rt6i_nsiblings)
        rt = rt6_multipath_select(rt, fl6, oif, strict);
    if (rt == net->ipv6.ip6_null_entry) {
        /* Nothing usable at this node: backtrack and retry. */
        fn = fib6_backtrack(fn, &fl6->saddr);
        if (fn)
            goto redo_rt6_select;
        else if (strict & RT6_LOOKUP_F_REACHABLE) {
            /* also consider unreachable route */
            strict &= ~RT6_LOOKUP_F_REACHABLE;
            fn = saved_fn;
            goto redo_rt6_select;
        }
    }

   /* Case 1: the result is the null entry or already a cached (RTF_CACHE)
    * route -- return it directly.
    */

    if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);

        rt6_dst_from_metrics_check(rt);
        return rt;
    } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                !(rt->rt6i_flags & RTF_GATEWAY))) {
        /* Create a RTF_CACHE clone which will not be
         * owned by the fib6 tree.  It is for the special case where
         * the daddr in the skb during the neighbor look-up is different
         * from the fl6->daddr used to look-up route here.
         *
         * Case 2: the flow has FLOWI_FLAG_KNOWN_NH set and the route
         * has no gateway (RTF_GATEWAY clear).  Per the original notes,
         * looking up a route with an already-known next hop is
         * uncommon; fl6->daddr then holds the next-hop address rather
         * than the destination address of the skb, so the clone
         * created here is not cached in the fib6 tree and is instead
         * added to the uncached_list.
         */

        struct rt6_info *uncached_rt;

        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);

        uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
        /* Drop the hold on the original route now that the clone exists. */
        dst_release(&rt->dst);

        if (uncached_rt)
            rt6_uncached_list_add(uncached_rt);
        else
            /* Allocation failed: fall back to the null entry. */
            uncached_rt = net->ipv6.ip6_null_entry;

        dst_hold(&uncached_rt->dst);
        return uncached_rt;

    } else {
        /* Get a percpu copy.
         *
         * Case 3: neither case above applied.  Use the per-CPU route
         * cache entry hanging off this route, creating it if needed.
         * Such an entry is cached on the route itself, so it does not
         * need to be added to the uncached_list.
         */

        struct rt6_info *pcpu_rt;

        rt->dst.lastuse = jiffies;
        rt->dst.__use++;
        pcpu_rt = rt6_get_pcpu_route(rt);

        if (pcpu_rt) {
            read_unlock_bh(&table->tb6_lock);
        } else {
            /* We have to do the read_unlock first
             * because rt6_make_pcpu_route() may trigger
             * ip6_dst_gc() which will take the write_lock.
             */
            /* Hold rt across the unlocked window while the copy is made. */
            dst_hold(&rt->dst);
            read_unlock_bh(&table->tb6_lock);
            pcpu_rt = rt6_make_pcpu_route(rt);
            dst_release(&rt->dst);
        }

        return pcpu_rt;

    }
}

  对于使用ICMPv6的IPv6邻居发现、IGMP和MLD协议,利用icmp6_dst_alloc分配路由缓存项。这类报文仅限于本地网络,报文的下一跳地址和目的地址相同,因此这里不查询fib6表,而是直接分配缓存项。这样新分配的路由在fib6树中没有缓存位置:较新的内核将其添加到uncached_list链表;下面所示版本的代码则将其挂到icmp6_dst_gc_list链表,并调用fib6_force_start_gc启动定时回收。

/*
 * icmp6_dst_alloc - build a stand-alone host route to fl6->daddr on @dev.
 *
 * Per the original notes this is used for ICMPv6-based neighbour discovery
 * and MLD traffic, which stays on the local link so that next hop and
 * destination are the same address.  The fib6 table is not consulted: a
 * route is allocated directly with gateway and destination both set to
 * fl6->daddr and a /128 prefix length.  Because the entry has no place in
 * the fib tree, it is chained onto icmp6_dst_gc_list and garbage collection
 * is started with fib6_force_start_gc() so it can be reclaimed later.
 *
 * @dev: device the route is bound to (also selects the namespace)
 * @fl6: flow whose daddr becomes the route's destination and gateway
 *
 * Returns the result of xfrm_lookup() on the new dst, ERR_PTR(-ENODEV) when
 * @dev has no inet6_dev, or ERR_PTR(-ENOMEM) when allocation fails.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                  struct flowi6 *fl6)
{
    struct inet6_dev *idev = in6_dev_get(dev);
    struct net *net = dev_net(dev);
    struct rt6_info *rt;

    if (unlikely(!idev))
        return ERR_PTR(-ENODEV);

    rt = ip6_dst_alloc(net, dev, 0);
    if (unlikely(!rt)) {
        /* Give back the inet6_dev reference taken above. */
        in6_dev_put(idev);
        return ERR_PTR(-ENOMEM);
    }

    /* Generic dst setup; the entry starts with a refcount of 1. */
    rt->dst.flags |= DST_HOST;
    rt->dst.input = ip6_input;
    rt->dst.output = ip6_output;
    atomic_set(&rt->dst.__refcnt, 1);

    /* Host route: gateway == destination, full-length prefix. */
    rt->rt6i_gateway = fl6->daddr;
    rt->rt6i_dst.addr = fl6->daddr;
    rt->rt6i_dst.plen = 128;
    rt->rt6i_idev = idev;
    dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

    /* Push onto the head of the ICMPv6 dst GC list. */
    spin_lock_bh(&icmp6_dst_lock);
    rt->dst.next = icmp6_dst_gc_list;
    icmp6_dst_gc_list = &rt->dst;
    spin_unlock_bh(&icmp6_dst_lock);

    fib6_force_start_gc(net);

    return xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
}

uncached路由缓存清除

当接口被注销或者down时,由函数rt6_uncached_list_flush_dev清除设备相关的uncached路由缓存

/*
 * rt6_uncached_list_flush_dev - detach uncached routes from a device.
 *
 * Walks the per-CPU rt6_uncached_list of every possible CPU.  For each
 * entry that refers to @dev, either through its inet6_dev or through
 * dst.dev, the reference is moved over to the namespace's loopback device.
 * Entries are re-pointed only; nothing is removed from the list here.
 *
 * @net: namespace whose loopback_dev is used as the replacement
 * @dev: device being flushed; a no-op when it is the loopback device itself
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
    struct net_device *lo = net->loopback_dev;
    int cpu;

    if (dev == lo)
        return;

    for_each_possible_cpu(cpu) {
        struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
        struct rt6_info *entry;

        spin_lock_bh(&ul->lock);
        list_for_each_entry(entry, &ul->head, rt6i_uncached) {
            struct inet6_dev *old_idev = entry->rt6i_idev;
            struct net_device *old_dev = entry->dst.dev;

            /* Swap the inet6_dev for loopback's, then drop the old one. */
            if (old_idev->dev == dev) {
                entry->rt6i_idev = in6_dev_get(lo);
                in6_dev_put(old_idev);
            }

            if (old_dev != dev)
                continue;

            /* Re-point dst.dev at loopback: hold the new, put the old. */
            entry->dst.dev = lo;
            dev_hold(lo);
            dev_put(old_dev);
        }
        spin_unlock_bh(&ul->lock);
    }
}

PCPU路由缓存查找

  如果路由查询结果rt6_info的成员rt6i_pcpu在当前CPU上对应的槽位已有值,表明每处理器缓存已存在,增加其引用计数后直接返回;否则返回NULL,由调用者通过rt6_make_pcpu_route创建。

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
    struct rt6_info *pcpu_rt, **p;

    p = this_cpu_ptr(rt->rt6i_pcpu); 
    pcpu_rt = *p;

    if (pcpu_rt) {
        dst_hold(&pcpu_rt->dst);
        rt6_dst_from_metrics_check(pcpu_rt);
    }
    return pcpu_rt;
}

   函数ip6_rt_pcpu_alloc负责分配并初始化每处理器路由缓存。如果内核正在删除该路由信息(较新的内核在函数fib6_drop_pcpu_from中将路由信息的fib6_destroying设置为1;下面所示版本则以rt->rt6i_pcpu为空来判断),此种情况下,应当释放每处理器路由缓存所依据的路由信息,下面所示版本的做法是直接销毁刚分配的每处理器路由缓存。

/*
 * rt6_make_pcpu_route - create and install this CPU's copy of @rt.
 *
 * Called without tb6_lock held; the lock is re-taken here after the
 * allocation.  Three outcomes, each returned with dst_hold() applied:
 *   - allocation failed: the namespace's ip6_null_entry;
 *   - rt->rt6i_pcpu still present: the per-CPU entry in this CPU's slot,
 *     which is either the new copy or one installed concurrently;
 *   - rt->rt6i_pcpu is gone: @rt itself, with the new copy destroyed.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
    struct fib6_table *table = rt->rt6i_table;
    struct rt6_info *pcpu_rt, *prev, **p;
 /* Allocate and initialise the per-CPU copy (per the original notes,
  * ip6_rt_pcpu_alloc() also sets the RTF_PCPU flag).
  */
    pcpu_rt = ip6_rt_pcpu_alloc(rt);
    if (!pcpu_rt) {
        struct net *net = dev_net(rt->dst.dev);

        dst_hold(&net->ipv6.ip6_null_entry->dst);
        return net->ipv6.ip6_null_entry;
    }

    read_lock_bh(&table->tb6_lock);
    if (rt->rt6i_pcpu) {
        p = this_cpu_ptr(rt->rt6i_pcpu);
        /* Install the copy only if the slot is still empty. */
        prev = cmpxchg(p, NULL, pcpu_rt);
        if (prev) {
            /* If someone did it before us, return prev instead */
            dst_destroy(&pcpu_rt->dst);
            pcpu_rt = prev;
        }
    } else {
        /* rt has been removed from the fib6 tree
         * before we have a chance to acquire the read_lock.
         * In this case, don't bother to create a pcpu rt
         * since rt is going away anyway.  The next
         * dst_check() will trigger a re-lookup.
         *
         * Note this path: the freshly allocated pcpu_rt is destroyed
         * right away and rt itself is returned instead.
         */
        dst_destroy(&pcpu_rt->dst);
        pcpu_rt = rt;
    }
    dst_hold(&pcpu_rt->dst);
    rt6_dst_from_metrics_check(pcpu_rt);
    read_unlock_bh(&table->tb6_lock);
    return pcpu_rt;
}

相关