首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 操作系统 > UNIXLINUX >

Linux环境下libpcap库源代码分析

2013-10-18 
Linux环境下libpcap库源代码分析linux环境下libpcap 源代码分析韩大卫@吉林师范大学libpcap 源代码官方下

Linux环境下libpcap库源代码分析

linux环境下libpcap 源代码分析韩大卫@吉林师范大学libpcap 源代码官方下载地址:git clone https://github.com/the-tcpdump-group/libpcap.git tcpdump源代码官方下载地址:git clone git://bpf.tcpdump.org/tcpdump tcpdump.c使用libpcap里的pcap_open_live和pcap_loop 完成两个最关键的动作:获取捕获报文的接口,和捕获报文并将报文交给callback。 (关于tcpdump源代码的构架,请参考作者的tcpdump源代码分析) 现结合libpcap源代码分析pcap_open_live和pcap_loop的实现机制,并进入linux内核,展示linux内核对这两个API的响应动作。tcpdump.c对pcap_open_live的使用是:pd = pcap_open_live(device, snaplen, !pflag, 1000, ebuf); pcap_open_live定义如下:pcap_t *pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf)source 为指定的网络接口。snaplen 为最大报文长度。promisc 表示是否将设备设置为混杂模式。to_ms 为超时时间。errbuf 为错误信息描述字符串。返回值为pcap_t类型的指针,pcap_t 定义是:typedef struct pcap pcap_t;struct pcap {/*typedef int (*read_op_t)(pcap_t *, int cnt, pcap_handler, u_char *);read_op为从网络接口读取报文的函数指针,待其得到赋值后,调用实现函数*/    read_op_t read_op; //从文件里读取报文的函数指针    int (*next_packet_op)(pcap_t *, struct pcap_pkthdr *, u_char **);//文件描述符,即socket    int fd;    int selectable_fd;       int bufsize;    //read缓冲区大小    u_char *buffer; //read缓冲区指针    u_char *bp;    int cc;...    int snapshot;    int linktype;       /* Network linktype */    int linktype_ext;          int tzoff;      /* timezone offset */    int offset;     /* offset for proper alignment */    int activated;      /* true if the capture is really started */    int oldstyle;       /* if we're opening with pcap_open_live() */    struct pcap_opt opt;     u_char *pkt;...   //激活函数,激活函数在得到调用后,会建立起与底层IPC的socket    activate_op_t activate_op;...};pcap_t *pcap_open_live(const char *source, int snaplen, int promisc, int to_ms, char *errbuf){       pcap_t *p;    int status;
   //创建捕获报文的接口句柄
    p = pcap_create(source, errbuf);    if (p == NULL)        return (NULL);    //设置最大报文长度    status = pcap_set_snaplen(p, snaplen);    if (status < 0)        goto fail;//将设备设为混杂模式    status = pcap_set_promisc(p, promisc);    if (status < 0)        goto fail;//设置超时时间    status = pcap_set_timeout(p, to_ms);    if (status < 0)        goto fail;    p->oldstyle = 1;//pcap_activate调用pcap_t的activate_op, 建立起与底层的IPC通道    status = pcap_activate(p);    if (status < 0)        goto fail;    return (p);...}pcap_t *pcap_create(const char *source, char *errbuf){       size_t i;    int is_theirs;    pcap_t *p;    if (source == NULL)        source = "any";//在capture_source_types数组里寻找是否有特定API集合的接口对应source    for (i = 0; capture_source_types[i].create_op != NULL; i++) {        is_theirs = 0;        p = capture_source_types[i].create_op(source, errbuf, &is_theirs);        if (is_theirs) {                return (p);        }    }    //如果没有, 那么就将source作为普通网络接口    return (pcap_create_interface(source, errbuf));}pcap_create_interface() 函数在libpcap下有多个实现,可由编译宏来指定特定的pcap_create_interface来初始化read_op等函数指针。linux环境里默认是libpcap/pcap-linux.c中的 pcap_create_interface():pcap_t *pcap_create_interface(const char *device, char *ebuf){      pcap_t *handle;    /*可将 pcap_create_common看做pcap_t结构的构造函数,初始化一个pcap_t*/    handle = pcap_create_common(device, ebuf, sizeof (struct pcap_linux));    if (handle == NULL)        return NULL;    //为pcap_t 的激活函数指针填充具体实现函数    handle->activate_op = pcap_activate_linux;    handle->can_set_rfmon_op = pcap_can_set_rfmon_linux;       return handle;}完成后回到pcap_open_live,设置snaplen,promisc,to_ms后,调用status = pcap_activate(p),该函数执行status = p->activate_op(p) ,进而调用 pcap_activate_linux(), 完成read_op等重要函数指针的具体赋值。static int pcap_activate_linux(pcap_t *handle){       struct pcap_linux *handlep = handle->priv;    const char  *device;    int     status = 0;        device = handle->opt.source;        handle->inject_op = pcap_inject_linux;    handle->setfilter_op = pcap_setfilter_linux;    
handle->setdirection_op = pcap_setdirection_linux;    handle->set_datalink_op = pcap_set_datalink_linux;    handle->getnonblock_op = pcap_getnonblock_fd;    handle->setnonblock_op = pcap_setnonblock_fd;    handle->cleanup_op = pcap_cleanup_linux;//最重要的函数指针read_op    handle->read_op = pcap_read_linux;    handle->stats_op = pcap_stats_linux;    if (strcmp(device, "any") == 0) {        if (handle->opt.promisc) {            handle->opt.promisc = 0;            /* Just a warning. */            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,                "Promiscuous mode not supported on the \"any\" device");            status = PCAP_WARNING_PROMISC_NOTSUP;        }    }      handlep->device = strdup(device);    if (handlep->device == NULL) {        snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "strdup: %s",             pcap_strerror(errno) );        return PCAP_ERROR;    }        handlep->timeout = handle->opt.timeout;    if (handle->opt.promisc)        handlep->proc_dropped = linux_if_drops(handlep->device);    //先使用activate_new()    status = activate_new(handle);    if (status < 0) {        goto fail;    }    //根据返回值具体处理    if (status == 1) {        switch (activate_mmap(handle, &status)) {        case 1:            return status;        case 0:           break;            case -1:            goto fail;        }    }//如果status为0, 再尝试使用activate_old()函数    else if (status == 0) {        /* Non-fatal error; try old way */        if ((status = activate_old(handle)) != 1) {            goto fail;        }    }    status = 0;    if (handle->opt.buffer_size != 0) {        //设置socket接收缓冲区(SO_RCVBUF)的大小        if (setsockopt(handle->fd, SOL_SOCKET, SO_RCVBUF,            &handle->opt.buffer_size,            sizeof(handle->opt.buffer_size)) == -1) {            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,                 "SO_RCVBUF: %s", pcap_strerror(errno));            status = PCAP_ERROR;            goto fail;        }    }     handle->selectable_fd = handle->fd;       return status;...}     
static intactivate_new(pcap_t *handle){   struct pcap_linux *handlep = handle->priv;    const char  *device = handle->opt.source;    int         is_any_device = (strcmp(device, "any") == 0);    int         sock_fd = -1, arptype;    int         err = 0;    struct packet_mreq  mr;/*指定网口情况下用PF_PACKET协议通信得到原始以太网数据帧数据关于socket()函数,我个人认为可以将其理解为open(): open()打开不同的文件,这样在返回的句柄里就可使用这个文件设备模块提供的ops,socket()打开不同的协议,返回句柄里也包括了该协议的底层模块提供的ops. 只不过linux下面没法将网络协议当作普通文件(如/dev/xx)处理,所以才有了另一套socket特定的APIs*/    sock_fd = is_any_device ?        socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL)) :        socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));...    handlep->sock_packet = 0;    /*iface_get_id()使用ioctl(fd, SIOCGIFINDEX, &ifr)获取lo还回设备的索引值*/    handlep->lo_ifindex = iface_get_id(sock_fd, "lo", handle->errbuf);        handle->offset   = 0;        if (!is_any_device) {        handlep->cooked = 0;            if (handle->opt.rfmon) {            err = enter_rfmon_mode(handle, sock_fd, device);            if (err < 0) {                close(sock_fd);                return err;            }            if (err == 0) {                close(sock_fd);                return PCAP_ERROR_RFMON_NOTSUP;            }            if (handlep->mondevice != NULL)                device = handlep->mondevice;        }/*iface_get_arptype()调用ioctl(fd, SIOCGIFHWADDR, &ifr)获取硬件类型 */        arptype = iface_get_arptype(sock_fd, device, handle->errbuf);        if (arptype < 0) {            close(sock_fd);            return arptype;        }        map_arphrd_to_dlt(handle, arptype, 1);  ...          
//获取指定设备的索引值        handlep->ifindex = iface_get_id(sock_fd, device, handle->errbuf);        if (handlep->ifindex == -1) {            close(sock_fd);            return PCAP_ERROR;/*iface_bind()将设备的索引值作为struct socketadd_ll的索引值与socket绑定    struct sockaddr_ll  sll;     sll.sll_family      = AF_PACKET;                                                                   sll.sll_ifindex     = ifindex;    sll.sll_protocol    = htons(ETH_P_ALL);bind(fd, (struct sockaddr *) &sll, sizeof(sll)) == -1 */        if ((err = iface_bind(sock_fd, handlep->ifindex,handle->errbuf)) != 1) {                close(sock_fd);            if (err < 0)                return err;            else                return 0;   /* try old mechanism */        }...    }    if (!is_any_device && handle->opt.promisc) {        memset(&mr, 0, sizeof(mr));        mr.mr_ifindex = handlep->ifindex;        mr.mr_type    = PACKET_MR_PROMISC;        if (setsockopt(sock_fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,            &mr, sizeof(mr)) == -1) {            snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,                "setsockopt: %s", pcap_strerror(errno));            close(sock_fd);            return PCAP_ERROR;        }    }    if (handlep->cooked) {        if (handle->snapshot < SLL_HDR_LEN + 1)            handle->snapshot = SLL_HDR_LEN + 1;    }    handle->bufsize = handle->snapshot;        //根据以太网链路层类型决定VLAN Tag在报文中的偏移值    switch (handle->linktype) {        case DLT_EN10MB:        handlep->vlan_offset = 2 * ETH_ALEN;        break;        case DLT_LINUX_SLL:        handlep->vlan_offset = 14;        break;        default:        handlep->vlan_offset = -1; /* unknown */        break;    }    //将sock_fd作为pcap_t的fd    handle->fd = sock_fd;...}至此,通过pcap_open_live完成全部准备阶段的内容, 之后就可以使用pcap_loop()来获取来自底层的数据并提交给callback函数进行应用处理, tcpdump.c 对pcap_loop的使用是: status = pcap_loop(pd, cnt, callback, pcap_userdata); //cnt 为指定捕获报文的个数在libpcap/pcap.c里有pcap_loop的定义:    int pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char 
*user){       register int n;        for (;;) {        if (p->rfile != NULL) {//从文件里读取报文            n = pcap_offline_read(p, cnt, callback, user);        } else {//从指定网口读取报文            do {//read_op即为pcap_read_linux,其内部调用pcap_read_packet                n = p->read_op(p, cnt, callback, user);            } while (n == 0);        }        //当n<=0时退出循环,退出pcap_loop        if (n <= 0)            return (n);        //如果达到捕获报文个数,退出pcap_loop        if (cnt > 0) {            cnt -= n;            if (cnt <= 0)                return (0);        }    }}   函数指针read_op指向pcap_read_linux,后者直接调用pcap_read_packet:static int               pcap_read_packet(pcap_t *handle, pcap_handler callback, u_char *userdata){     struct pcap_linux   *handlep = handle->priv;    u_char          *bp;//报文原始内容缓存    int   offset; 
        struct sockaddr_ll     from;//socket信息
socklen_t fromlen;//from的大小 int packet_len, caplen;//报文长度与捕获报文的长度 struct pcap_pkthdr pcap_header;//捕获报文的情况offset = 0; //初始化bp指针 ,让bp指向pcap_t里的read缓存区bp = handle->buffer + handle->offset; do { if (handle->break_loop) { handle->break_loop = 0; return PCAP_ERROR_BREAK; } fromlen = sizeof(from);//从socket接收信息存入bp指向的缓存区, 每次最多读取bufsize字节,MSG_TRUNC表示返回报文的实际长度(即使报文被截断) packet_len = recvfrom(handle->fd, bp + offset,handle->bufsize - offset, MSG_TRUNC,(struct sockaddr *) &from, &fromlen); } while (packet_len == -1 && errno == EINTR);...caplen = packet_len; if (caplen > handle->snapshot) caplen = handle->snapshot;//捕获报文时的信息 pcap_header.caplen = caplen; pcap_header.len = packet_len; handlep->packets_read++; //将数据内容bp交给函数指针callback指向的函数处理 callback(userdata, &pcap_header, bp); return 1; } Linux内核对recvfrom 的响应:net/socket.c SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, unsigned, flags, struct sockaddr __user *, addr, int __user *, addr_len){ struct socket *sock; struct iovec iov; struct msghdr msg; struct sockaddr_storage address; int err, err2; int fput_needed; if (size > INT_MAX) size = INT_MAX; //iov的最大缓存大小为#define INT_MAX ((int)(~0U>>1)) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_iovlen = 1; //将iov作为msg的缓存区数据结构,使得iov可以跟随msg一起作为参数传递下去 msg.msg_iov = &iov; /*iov的base地址为用户层recvfrom的bp+ offset,该缓存地址的大小赋值给iov的iov_len;在内核层使用copy_to_user()将数据拷贝到iov这里,即可完成数据对用户层的传递*/ iov.iov_len = size; iov.iov_base = ubuf; //将msg_name指针指向address, 后面调用中,为msg_name赋值时address便得到赋值 msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = sizeof(address); if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT;/*用户层的调用 packet_len = recvfrom(handle->fd, bp + offset,handle->bufsize - offset, MSG_TRUNC,(struct sockaddr *) &from, &fromlen); 在这里分两部分分别回应,先使用sock_recvmsg() 对报文内容缓存区的赋值*/ err = sock_recvmsg(sock, &msg, size, flags); //对recvfrom()里from和fromlen的赋值,此时address已得到赋值 if (err >= 0 && addr != NULL) { err2 = 
move_addr_to_user((struct sockaddr *)&address, msg.msg_namelen, addr, addr_len); if (err2 < 0) err = err2; } ...}sock_revmsg()会调用sock里的函数指针集合ops里的recvmsg,这个函数指针在不同的模块下有不同的实现函数:int sock_recvmsg(struct socket *sock, struct msghdr *msg,size_t size, int flags){ ... ret = __sock_recvmsg(&iocb, sock, msg, size, flags);...} static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags){ int err; struct sock_iocb *si = kiocb_to_siocb(iocb); si->sock = sock; si->scm = NULL; si->msg = msg; si->size = size; si->flags = flags; err = security_socket_recvmsg(sock, msg, size, flags); if (err) return err; return sock->ops->recvmsg(iocb, sock, msg, size, flags);} 由于activate_new()里面建立了 PF_PACKET协议的socket, 所以,linux会调用建立PF_PACKET的底层模块af_packet来响应recvmsg。 在linux启动阶段,af_packet模块初始化完成后,会填充ops->recvmsg等函数指针,对上层/net/sock完成接口对接。net/packet/af_packet.cstatic int __init packet_init(void){ int rc = proto_register(&packet_proto, 0); /* sock_register里的printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family); 会在linux启动阶段显示:NET: Registered protocol family 17 */ sock_register(&packet_family_ops); register_pernet_subsys(&packet_net_ops); register_netdevice_notifier(&packet_netdev_notifier);}static struct net_proto_family packet_family_ops = {//PF_PACKET即AF_PACKET,数值为17 .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE,};static int packet_create(struct net *net, struct socket *sock, int protocol){ struct sock *sk; ... sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; //为socket的ops指针集合填充实现函数。完成接口对接。 sock->ops = &packet_ops; ... 
return 0;} 在packet_ops里有对struct sock的函数指针recvmsg填充实现函数packet_recvmsgstatic const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, .sendpage = sock_no_sendpage,}; packet_recvmsg 封装了接受报文并并将数据拷贝到用户层全部动作:static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags){ struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; struct sockaddr_ll *sll; ... //第一步,从skb接收队列里取得数据交给skb缓存 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out; ... copied = skb->len; if (copied > len) { copied = len; //如果用户指定拷贝指定长度数据,按指定长度拷贝 msg->msg_flags |= MSG_TRUNC; } //第二步, 将获取到的数据skb拷贝到iov里,即完成数据对用户层的传递 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (err) goto out_free; sock_recv_timestamp(msg, sk, skb); /*将skb里的cb拷贝给msg->msg_name, 这样在net/socket.c的move_addr_to_user((struct sockaddr *)&address, msg.msg_namelen, addr, addr_len);就可以将此msg_name 传给用户层 。*/ if (msg->msg_name) memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,msg->msg_namelen); if (pkt_sk(sk)->auxdata) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; aux.tp_len = PACKET_SKB_CB(skb)->origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); aux.tp_vlan_tci = skb->vlan_tci; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); err = (flags&MSG_TRUNC) ? skb->len : copied;... 
return err;} net/core/datagram.c struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err){ int peeked; return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), &peeked, err);} __skb_recv_datagram的作用就是接收一个数据报缓存的数据结构,本文的分析就到__skb_recv_datagram从sk->sk_receive_queue 中取得skb结构数据为止,至于这个接收队列是由谁建立的,发送端在哪里,后续介绍。 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *err){ struct sk_buff *skb; long timeo; int error = sock_error(sk); if (error) goto no_packet; //阻塞模式下,获取sk的超时时间 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { unsigned long cpu_flags; //保证进程动作唯一,上spin锁 spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); //查看skb的*next指针是否有值,即是否有报文来到,有的话返回指针,没有返回NULL skb = skb_peek(&sk->sk_receive_queue); if (skb) { *peeked = skb->peeked; if (flags & MSG_PEEK) { skb->peeked = 1; //如果是MSG_PEEK动作的话就可以直接返回了 atomic_inc(&skb->users); } else//如果不是MSG_PEEK(查看动作)的话,那么将该skb从sk的接收队列中摘除(出队) __skb_unlink(skb, &sk->sk_receive_queue); } //解spin锁 spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); //有数据的话返回数据的缓存 if (skb) return skb; /*如果peek时没有数据到达,在阻塞情况下,等待一定时间,当达到超时时间还没有接收到数据,向err传送错误类型报告,退出本函数; 在非阻塞情况下,timeo为0,直接报错后退出*/ error = -EAGAIN; if (!timeo) goto no_packet; //按照timeo的数值阻塞本进程,在timeo时间内持续执行do...while } while (!wait_for_packet(sk, err, &timeo)); return NULL; no_packet: *err = error; return NULL;}接收到skb后,调用skb_copy_datagram_iovec 将其拷贝到msg的iov里struct iovec{ void __user *iov_base; //缓存的首地址 __kernel_size_t iov_len; //缓存可用的大小}; int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, struct iovec *to, int len){ //报文头部长度 int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; trace_skb_copy_datagram_iovec(skb, len); //复制报文头部 if (copy > 0) { if (copy > len) copy = len; //将skb的copy长度(报文头部)的数据缓存复制到iov里,完成对用户层数据的传递 if (memcpy_toiovec(to, skb->data + offset, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy;}... 
//对于分片报文的处理,暂时不关心此细节 skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; //递归调用skb_copy_datagram_iovec,offset-start表示当前分片报文的长度 if (skb_copy_datagram_iovec(frag_iter,offset - start,to, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT;}int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len){ while (len > 0) { if (iov->iov_len) {/*如果iov的iov_len大于len, 说明iov的缓存区还可以接受数据,那么设置本次拷贝大小为len,否则为copy为缓存还可以接收数据的长度*/ int copy = min_t(unsigned int, iov->iov_len, len); //将kdata拷贝到iov的base地址,长度为len,即将数据拷贝到用户层 if (copy_to_user(iov->iov_base, kdata, copy)) return -EFAULT;//每次拷贝后,kdata地址后移copy长度 kdata += copy; len -= copy;//每次拷贝后, 将iov_len减去已经使用的长度 iov->iov_len -= copy; //每次拷贝后,移动iov的base地址 iov->iov_base += copy; } iov++; } return 0;}总结:pcap_open_live 调用pcap_create()来为pcap_t填充read_op等函数指针,并提供了激活函数pcap_activate_linux,建立了socket与linux底层模块af_packet通信。 pcap_loop 调用了read_op的实现函数 pcap_read_linux, pcap_read_linux 里面使用了recvfrom 获取以太网原始数据,linux的af_packet模块会响应并完成recvfrom动作;recvfrom完成后调用callback指向的函数处理这些数据,callback指针的赋值是在tcpdump里根据具体链路层环境赋值的。欢迎大家交流,不足之处请不吝指正,给予批评!

热点排行