Writing to page-mapped DMA in the kernel

Problem Description

I have been working on modifying the Intel ixgbe kernel driver to work with my PCIe device (an FPGA, but that isn't hugely important). The kernel and the PCIe device negotiate fine: the configuration headers get passed and communication appears to work. But in trying to get DMA_FROM_DEVICE writes working I have hit a small problem I don't understand, and I'd appreciate some help.

rx_ring->desc = dma_alloc_coherent(dev, ///Allocates a coherent DMA region of rx_ring->size bytes on device dev with the GFP_KERNEL flag
                       rx_ring->size,
                       &rx_ring->dma,   ///This dma handle may be cast to an unsigned integer of the same bus width and given to dev as the DMA base address
                       GFP_KERNEL);

page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);

//Write to the PCI device the base address where it should place data.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
//This will perfectly read data I place onto the PCIe bus.
rx_ring->desc->wb.upper.length

//This seems to read some garbage memory.
dma_sync_single_range_for_cpu(rx_ring->dev,
                      rx_buffer->dma,
                      rx_buffer->page_offset,
                      acc_rx_bufsz(rx_ring),
                      DMA_FROM_DEVICE);
unsigned char *va = page_address(page) + rx_buffer->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));

//Some code later
dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
                 new_buff->page_offset,
                 acc_rx_bufsz(rx_ring),
                 DMA_FROM_DEVICE);

I have tried to cut the code down to just the points of interest, but here is the short version. Via dma_alloc_coherent I allocate space for DMA; that function produces both a kernel virtual address and a bus address. Via dev_alloc_pages I create a memory page for DMA, and via the dma_map_page command I map that page for DMA. I pass the DMA bus address to my PCIe device so that it can write to the correct offset, using the writel command (I know about iowrite32, but this is on Red Hat).
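
To spell out my mental model (just a sketch; the comments are my assumptions, not verified behavior), the two allocations above give me two independent device-visible addresses:

/* Coherent mapping: one allocation, two views of the same memory.
   rx_ring->desc is the CPU virtual address; rx_ring->dma is the bus
   address the device uses to reach the descriptor ring. */
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size, &rx_ring->dma, GFP_KERNEL);

/* Streaming mapping: the page gets its own, separate bus address.
   page_address(page) is only the CPU's view; the handle returned by
   dma_map_page is what the device would have to target for packet data. */
page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);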

From here, the original ixgbe driver has two ways of reading data off the PCIe bus. First, it reads directly from the virtual address of the coherent allocation (desc), but that is only used for configuration information (in the driver I am working from). The second way is via page_address(page), which I believe returns the kernel virtual address of the memory page. The problem is that there is nothing but garbage there.
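
Concretely, the two read paths as I understand them (same names as above; the le16_to_cpu on the writeback length matches what acc_add_rx_frag does further down):

/* Path 1: coherent descriptor ring; the CPU can read it directly. */
u16 len = le16_to_cpu(rx_ring->desc->wb.upper.length);

/* Path 2: streaming page mapping; hand the range back to the CPU
   before reading, and back to the device afterwards. */
dma_sync_single_range_for_cpu(rx_ring->dev, rx_buffer->dma,
                  rx_buffer->page_offset,
                  acc_rx_bufsz(rx_ring), DMA_FROM_DEVICE);
unsigned char *va = page_address(rx_buffer->page) + rx_buffer->page_offset;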

So here is my confusion. Where does the page point, and how do I get data into the page over the PCI bus? I assumed dma_map_page would merge the two addresses into one, so that writes arriving at the DMA bus address would land in the page, but that does not seem to be the case. What base address should my PCIe device write to in order to hit this memory page?
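
My working assumption (which may be exactly what is wrong) is that the device should write packet data to the streaming handle, offset per buffer, i.e. the same value the driver programs into each descriptor in acc_alloc_rx_buffers below:

/* Per-buffer DMA target as programmed into the descriptor: */
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

/* By contrast, virt_to_bus(page_address(page)) bypasses the DMA API,
   and as I understand it is not guaranteed to equal bi->dma once an
   IOMMU is involved, so writing that value to the device (as acc_poll
   does below) may hand it a bogus address. */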

I am working on Red Hat, specifically a CentOS kernel, version 3.10.0, which causes some problems because the Red Hat kernel differs substantially from the mainline kernel, but hopefully someone can help. Thanks for any pointers.

Edit: Added the dma_sync calls I forgot to include in the original post.

EDIT2: Added a more complete code base. As a note, I am still not including some struct definitions or the top-level function calls (such as probe), but hopefully this is much more complete. Sorry for how long it is.

//These functions are called during configuration
int acc_setup_rx_resources(struct acc_ring *rx_ring)
{
    struct device *dev = rx_ring->dev;
    int orig_node = dev_to_node(dev);
    int numa_node = -1;
    int size;

    size = sizeof(struct acc_rx_buffer) * rx_ring->count;

    if (rx_ring->q_vector)
        numa_node = rx_ring->q_vector->numa_node;

    rx_ring->rx_buffer_info = vzalloc_node(size, numa_node);
    if (!rx_ring->rx_buffer_info)
        rx_ring->rx_buffer_info = vzalloc(size);
    if (!rx_ring->rx_buffer_info)
        goto err;

    /* Round up to nearest 4K */
    rx_ring->size = rx_ring->count * sizeof(union acc_adv_rx_desc);
    rx_ring->size = ALIGN(rx_ring->size, 4096);

    set_dev_node(dev, numa_node);
    rx_ring->desc = dma_alloc_coherent(dev, 
                       rx_ring->size,
                       &rx_ring->dma,   
                       GFP_KERNEL);
    set_dev_node(dev, orig_node);
    if (!rx_ring->desc)
        rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
                           &rx_ring->dma, GFP_KERNEL);
    if (!rx_ring->desc)
        goto err;

    rx_ring->next_to_clean = 0;
    rx_ring->next_to_use = 0;

    return 0;
err:
    vfree(rx_ring->rx_buffer_info);
    rx_ring->rx_buffer_info = NULL;
    dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
    return -ENOMEM;
}

static bool acc_alloc_mapped_page(struct acc_ring *rx_ring,
                    struct acc_rx_buffer *bi)
{
    struct page *page = bi->page;
    dma_addr_t dma = bi->dma;

    if (likely(page))
        return true;

    page = dev_alloc_pages(0);
    if(unlikely(!page)){
        rx_ring->rx_stats.alloc_rx_page_failed++;
        return false;
    }

    /* map page for use */
    dma = dma_map_page(rx_ring->dev, page, 0,
               acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);

    if (dma_mapping_error(rx_ring->dev, dma)) {
        __free_pages(page, acc_rx_pg_order(rx_ring));
        bi->page = NULL;

        rx_ring->rx_stats.alloc_rx_page_failed++;
        return false;
    }
    bi->dma = dma;
    bi->page = page; 
    bi->page_offset = 0;
    page_ref_add(page, USHRT_MAX - 1);  //This seems to exist in redhat kernel but not 3.10 base kernel... keep?

    return true;
}

void acc_alloc_rx_buffers(struct acc_ring *rx_ring, u16 cleaned_count)
{
    union acc_adv_rx_desc *rx_desc;
    struct acc_rx_buffer *bi;
    u16 i = rx_ring->next_to_use;   
    printk(KERN_INFO "acc Attempting to allocate rx buffers\n");

    /* nothing to do */
    if (!cleaned_count)
        return;

    rx_desc = ACC_RX_DESC(rx_ring, i);  
    bi = &rx_ring->rx_buffer_info[i];   
    i -= rx_ring->count;    

    do {
        if (!acc_alloc_mapped_page(rx_ring, bi)){
            printk(KERN_INFO "acc Failed to allocate and map the page to dma\n");
            break;
        }
        printk(KERN_INFO "acc happily allocated and mapped page to dma\n");

        /*
         * Refresh the desc even if buffer_addrs didn't change
         * because each write-back erases this info.
         */
        rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

        rx_desc++;
        bi++;   ///Move to the next buffer
        i++;
        if (unlikely(!i)) {
            rx_desc = ACC_RX_DESC(rx_ring, 0);
            bi = rx_ring->rx_buffer_info;
            i -= rx_ring->count;
        }

        /* clear the hdr_addr for the next_to_use descriptor */
        rx_desc->read.hdr_addr = 0;

        cleaned_count--;
    } while (cleaned_count); 

    i += rx_ring->count;

    if (rx_ring->next_to_use != i)
        acc_release_rx_desc(rx_ring, i);
}

//This function is called via a napi_schedule command which fires when an MSI interrupt is thrown from my PCIe device (all works fine).
int acc_poll(struct napi_struct *napi, int budget)
{
    struct acc_q_vector *q_vector =
                container_of(napi, struct acc_q_vector, napi);
    struct acc_adapter *adapter = q_vector->adapter;
    struct acc_ring *ring;
    int per_ring_budget;
    bool clean_complete = true;

    e_dev_info("Landed in acc_poll\n");

    e_dev_info("Attempting to read register space 0x00=%x\t0x04=%x\n", \
        readl(q_vector->adapter->hw.hw_addr), readl(q_vector->adapter->hw.hw_addr+0x04));
    e_dev_info("Attempting to write to pci ctl\n");
    e_dev_info("Target address %.8x%.8x\n",q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF);
    e_dev_info("Attempted page address %.8x%.8x\n",virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF);
    writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr+ACC_PCI_IPCONT_DATA_OFFSET);  //These are supposed to be iowrite64 but it seems iowrite64 is different in redhat and only supports the copy function (to,from,size). yay redhat think different.

    writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
    writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);

    writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, q_vector->adapter->hw_region2.hw_addr+0x10+ACC_PCI_IPCONT_DATA_OFFSET);
    writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x14+ACC_PCI_IPCONT_DATA_OFFSET);

    writeq(0xFF00000000000000, q_vector->adapter->hw_region2.hw_addr+0x18+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x20+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x28+ACC_PCI_IPCONT_DATA_OFFSET);
    writeq(0x0003344000005500, q_vector->adapter->hw_region2.hw_addr+0x30+ACC_PCI_IPCONT_DATA_OFFSET);

    //Send the start command to the block
    writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr);


    acc_for_each_ring(ring, q_vector->tx)
        clean_complete &= !!acc_clean_tx_irq(q_vector, ring);

    if (q_vector->rx.count > 1)
        per_ring_budget = max(budget/q_vector->rx.count, 1);
    else
        per_ring_budget = budget;

    acc_for_each_ring(ring, q_vector->rx){
        e_dev_info("Calling clean_rx_irq\n");
        clean_complete &= acc_clean_rx_irq(q_vector, ring,  
                             per_ring_budget);
    }

    /* If all work not completed, return budget and keep polling */
    if (!clean_complete)
        return budget;

    e_dev_info("Clean complete\n");

    /* all work done, exit the polling mode */
    napi_complete(napi);
    if (adapter->rx_itr_setting & 1)
        acc_set_itr(q_vector);
    if (!test_bit(__ACC_DOWN, &adapter->state))
        acc_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));

    e_dev_info("Exiting acc_poll\n");

    return 0;
}

static bool acc_clean_rx_irq(struct acc_q_vector *q_vector,
                   struct acc_ring *rx_ring,
                   const int budget)
{
    printk(KERN_INFO "acc Entered clean_rx_irq\n");
    unsigned int total_rx_bytes = 0, total_rx_packets = 0;
    u16 cleaned_count = acc_desc_unused(rx_ring);   /// First pass this is count-1 because ntc and ntu are 0 so this is 512-1=511

    printk(KERN_INFO "acc RX irq Clean count = %d\n", cleaned_count);

    do {
        union acc_adv_rx_desc *rx_desc;
        struct sk_buff *skb;

        /* return some buffers to hardware, one at a time is too slow */
        if (cleaned_count >= ACC_RX_BUFFER_WRITE) { //When the clean count is >16 allocate some more buffers to get the clean count down. First pass this happens.
            acc_alloc_rx_buffers(rx_ring, cleaned_count);
            cleaned_count = 0;
        }

        rx_desc = ACC_RX_DESC(rx_ring, rx_ring->next_to_clean);

        printk(KERN_INFO "acc inside RX do while, acquired description\n");

        printk(KERN_INFO "acc Everything I can about the rx_ring desc (acc_rx_buffer). status_error=%d\t \
        length=%d\n", rx_desc->wb.upper.status_error, rx_desc->wb.upper.length);

        if (!acc_test_staterr(rx_desc, ACC_RXD_STAT_DD))
            break;

        printk(KERN_INFO "acc inside RX past status_error check\n");
        /*
         * This memory barrier is needed to keep us from reading
         * any other fields out of the rx_desc until we know the
         * RXD_STAT_DD bit is set
         */
        rmb();

        /* retrieve a buffer from the ring */
        skb = acc_fetch_rx_buffer(rx_ring, rx_desc);

        /* exit if we failed to retrieve a buffer */
        if (!skb)
            break;

        printk(KERN_INFO "acc successfully retrieved a buffer\n");

        cleaned_count++;

        /* place incomplete frames back on ring for completion */
        if (acc_is_non_eop(rx_ring, rx_desc, skb))
            continue;

        /* verify the packet layout is correct */
        if (acc_cleanup_headers(rx_ring, rx_desc, skb))
            continue;

        /* probably a little skewed due to removing CRC */
        total_rx_bytes += skb->len;

        /* populate checksum, timestamp, VLAN, and protocol */
        acc_process_skb_fields(rx_ring, rx_desc, skb);

        acc_rx_skb(q_vector, skb);  ///I believe this hands the skb up to the kernel network stack and on to the rest of the OS

        /* update budget accounting */
        total_rx_packets++;
    } while (likely(total_rx_packets < budget));

    printk(KERN_INFO "acc rx irq exited the while loop\n");

    u64_stats_update_begin(&rx_ring->syncp);
    rx_ring->stats.packets += total_rx_packets;
    rx_ring->stats.bytes += total_rx_bytes;
    u64_stats_update_end(&rx_ring->syncp);
    q_vector->rx.total_packets += total_rx_packets;
    q_vector->rx.total_bytes += total_rx_bytes;

    if (cleaned_count)
        acc_alloc_rx_buffers(rx_ring, cleaned_count);

    printk(KERN_INFO "acc rx irq returning happily\n");

    return (total_rx_packets < budget);
}

static struct sk_buff *acc_fetch_rx_buffer(struct acc_ring *rx_ring,
                         union acc_adv_rx_desc *rx_desc)
{
    struct acc_rx_buffer *rx_buffer;
    struct sk_buff *skb;
    struct page *page;

    printk(KERN_INFO "acc Attempting to fetch rx buffer\n");

    rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
    page = rx_buffer->page; //This page is set by I think acc_add_rx_frag... hard to tell. yes the page is created there and kind of linked to the dma via dma_map_page
    prefetchw(page);    ///Prefetch the page cacheline for writing

    skb = rx_buffer->skb;   ///This does the mapping between skb and dma page table I believe.

    if (likely(!skb)) {
        printk(KERN_INFO "acc attempting to allocate netdrv space for page.\n");
        void *page_addr = page_address(page) +  //get the virtual page address of this page.
                  rx_buffer->page_offset;

        /* prefetch first cache line of first page */
        prefetch(page_addr);
#if L1_CACHE_BYTES < 128
        prefetch(page_addr + L1_CACHE_BYTES);
#endif

        /* allocate a skb to store the frags */
        skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
                        ACC_RX_HDR_SIZE);
        if (unlikely(!skb)) {
            rx_ring->rx_stats.alloc_rx_buff_failed++;
            return NULL;
        }

        /*
         * we will be copying header into skb->data in
         * pskb_may_pull so it is in our interest to prefetch
         * it now to avoid a possible cache miss
         */
        prefetchw(skb->data);

        /*
         * Delay unmapping of the first packet. It carries the
         * header information, HW may still access the header
         * after the writeback.  Only unmap it when EOP is
         * reached
         */
        if (likely(acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP)))
            goto dma_sync;

        ACC_CB(skb)->dma = rx_buffer->dma;
    } else {
        if (acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP))
            acc_dma_sync_frag(rx_ring, skb);

dma_sync:
        /* we are reusing so sync this buffer for CPU use */
        printk(KERN_INFO "acc attempting to sync the dma and the device.\n");
        dma_sync_single_range_for_cpu(rx_ring->dev, //Sync this buffer range back to the CPU so we can safely read what the device wrote
                          rx_buffer->dma,
                          rx_buffer->page_offset,
                          acc_rx_bufsz(rx_ring),
                          DMA_FROM_DEVICE);
    }

    /* pull page into skb */
    if (acc_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
        //Temporary: return early here to fence off the problem; the reuse path below is skipped for now.
        return skb;
        /* hand second half of page back to the ring */
        acc_reuse_rx_page(rx_ring, rx_buffer);
    } else if (ACC_CB(skb)->dma == rx_buffer->dma) {
        /* the page has been released from the ring */
        ACC_CB(skb)->page_released = true;
    } else {
        /* we are not reusing the buffer so unmap it */
        dma_unmap_page(rx_ring->dev, rx_buffer->dma,
                   acc_rx_pg_size(rx_ring),
                   DMA_FROM_DEVICE);
    }

    /* clear contents of buffer_info */
    rx_buffer->skb = NULL;
    rx_buffer->dma = 0;
    rx_buffer->page = NULL;

    printk(KERN_INFO "acc returning from fetch_rx_buffer.\n");

    return skb;
}

static bool acc_add_rx_frag(struct acc_ring *rx_ring,
                  struct acc_rx_buffer *rx_buffer,
                  union acc_adv_rx_desc *rx_desc,
                  struct sk_buff *skb)
{
    printk(KERN_INFO "acc Attempting to add rx_frag from page.\n");
    struct page *page = rx_buffer->page;
    unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
#if (PAGE_SIZE < 8192)
    unsigned int truesize = acc_rx_bufsz(rx_ring);
#else
    unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
    unsigned int last_offset = acc_rx_pg_size(rx_ring) -
                   acc_rx_bufsz(rx_ring);
#endif

    if ((size <= ACC_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
        printk(KERN_INFO "acc Inside the size check.\n");
        unsigned char *va = page_address(page) + rx_buffer->page_offset;
        printk(KERN_INFO "page:%p\tpage_address:%p\tpage_offset:%d\n",page,page_address(page),rx_buffer->page_offset);
        printk(KERN_INFO "acc First 4 bytes of string:%x  %x  %x  %x\n",va[0],va[1],va[2],va[3]); //FIXME: I can now read this page table but there is still no meaningful data in it. (appear to be reading garbage)
        printk(KERN_INFO "acc 32 bytes in:%x %x %x %x\n",va[32],va[33],va[34],va[35]);
        return true;

        memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));    

        /* we can reuse buffer as-is, just make sure it is local */
        if (likely(page_to_nid(page) == numa_node_id()))
            return true;

        /* this page cannot be reused so discard it */
        put_page(page);
        return false;
    }

    skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
            rx_buffer->page_offset, size, truesize);

    /* avoid re-using remote pages */
    if (unlikely(page_to_nid(page) != numa_node_id()))
        return false;

#if (PAGE_SIZE < 8192)
    /* if we are only owner of page we can reuse it */
    if (unlikely(page_count(page) != 1))
        return false;

    /* flip page offset to other buffer */
    rx_buffer->page_offset ^= truesize;

    /*
     * since we are the only owner of the page and we need to
     * increment it, just set the value to 2 in order to avoid
     * an unecessary locked operation
     */
    atomic_set(&page->_count, 2);
#else
    /* move offset up to the next cache line */
    rx_buffer->page_offset += truesize;

    if (rx_buffer->page_offset > last_offset)
        return false;

    /* bump ref count on page before it is given to the stack */
    get_page(page);
#endif

    return true;
}

Tags: c, linux-kernel, kernel, redhat, vivado
