DPDK 17.11.1 - 執行基於目標的速率限制時看到的下降

Question

編輯問題陳述以突出核心邏輯的更多信息

在進行基於目的地的速率限制時，我們遇到了性能問題。 我們為每個 {destination-src} 對（最多 100 個目的地和 2^16 個源）維護狀態。 我們有一個包含 100 個節點的數組，每個節點都有一個 rte_hash*。 這個哈希表維護該目的地所見到的每個源 ip 的狀態。 我們為所見的每個目的地（0 到 100）都有一個映射，用於索引到該數組中。 如果特定源在一秒鐘內超過為該目的地定義的閾值，我們將阻止該源，否則放行該源。 在運行時，當我們只看到 2 或 3 個目的地的流量時，沒有問題，但是當目的地超過 5 個時，我們看到大量丟包。 我們的函數必須進行查找並識別與 dest_ip 和 src_ip 匹配的流， 然後處理該流並決定是否需要丟棄數據包。 如果未找到流，則將其添加到哈希表中。

/* Per-destination node: holds the hash table of sources seen for it. */
struct flow_state {
    struct rte_hash *hash;
};

/*
 * One node per destination. The hash tables are created with
 * rte_hash_create at pipeline_init and freed during pipeline_free.
 */
struct flow_state flow_state_arr[100];

下面用偽代碼概述我們的做法。

run()
{
    1) do rx
    2) from the pkt, get index into the flow_state_arr and retrieve the rte_hash* handle    
    3) rte_hash_lookup_data(hash, src_ip,flow_data)
    4) if entry found, take decision on the flow (the decision is simply say rate limiting the flow)
    5) else rte_hash_add_key_data(hash,src_ip,new_flow_data) to add the flow to table and forward
}

請指導我們是否可以在數據路徑中擁有這些多個哈希表對象，或者如果我們需要分別處理每個目的地的狀態，最好的方法是什么。

編輯
謝謝回答。 我很樂意分享代碼片段和我們收集的結果。 我沒有其他 DPDK 版本的比較結果，但以下是我們使用 17.11.1 進行測試的一些結果。

測試設置
我正在為 3 個目的地 14.143.156.x（在本例中為 101,102,103）使用 IXIA 流量生成器（使用兩個 10G 鏈接生成 12Mpps）。 每個目的地的流量來自 2^16 個不同的來源。 這是流量生成器設置。

代碼片段

    /* State kept for one destination (one rate-limit rule). */
    struct flow_state_t {
        struct rte_hash *hash;  /* flow table looked up by source IP */
        uint32_t size;          /* number of flow slots handed out so far */
        uint64_t threshold;     /* packet-count limit; 0 means no limiting */
    };
    /*
     * State kept for one (destination, source) flow.
     * Field order matters: the flow is filled with a positional initializer.
     */
    struct flow_data_t {
        uint8_t curr_state;       /* 0 = blocked, 1 = allowed */
        uint64_t pps_count;       /* packets counted in the current window */
        uint64_t src_first_seen;  /* timer-cycle stamp of the window start */
    };
    /*
     * Rate-limit pipeline instance. Holds the per-destination rule table and
     * the backing storage for every flow entry referenced from the hash tables.
     */
    struct pipeline_ratelimit {
        struct pipeline p;
        struct pipeline_ratelimit_params params;
        /* The f_hash and swap_* fields below are not referenced by the visible code. */
        rte_table_hash_op_hash f_hash;
        uint32_t swap_field0_offset[SWAP_DIM];
        uint32_t swap_field1_offset[SWAP_DIM];
        uint64_t swap_field_mask[SWAP_DIM];
        uint32_t swap_n_fields;
        pipeline_msg_req_handler custom_handlers[2]; /* handlers for add and del */
        /* One rule slot per destination, indexed by the id carried with the packet. */
        struct flow_state_t flow_state_arr[100];
        /*
         * Flow storage, indexed as flows[id][slot]; the hash tables store
         * pointers into this array. 100 * 65536 entries are embedded inline.
         */
        struct flow_data_t flows[100][65536];
    } __rte_cache_aligned;
    
    /*
      add_handler(pipeline,msg) -- msg includes index and threshold
      In the add handler
      a rule/ threshold is added for a destination
      rte_hash_create and store rte_hash* in flow_state_arr[index]
      max of 100 destinations or rules are allowed
      previous pipelines add the ID (index) to the packet to look in to the
      flow_state_arr for the rule
    */
    
    /*
      del_handler(pipeline,msg) -- msg includes index
      In the del handler
      a rule/ threshold @index is deleted
      the associated rte_hash* is also freed
      the slot is made free
    */
    
    #define ALLOWED 1
    #define BLOCKED 0
    #define TABLE_MAX_CAPACITY 65536
    int do_rate_limit(struct pipeline_ratelimit* ps, uint32_t id, unsigned char* pkt)
    {
        uint64_t curr_time_stamp = rte_get_timer_cycles();
        struct iphdr* iph = (struct iphdr*)pkt;
        uint32_t src_ip = rte_be_to_cpu_32(iph->saddr);
    
        struct flow_state_t* node = &ps->flow_state_arr[id];
        struct flow_data_t* flow = NULL
        rte_hash_lookup_data(node->hash, &src_ip, (void**)&flow);
        if (flow != NULL)
        {
            if (flow->curr_state == ALLOWED)
            {
                if (flow->pps_count++ > node->threshold)
                {
                    uint64_t seconds_elapsed = (curr_time_stamp - flow->src_first_seen) / CYCLES_IN_1SEC;
                    if (seconds_elapsed)
                    {
                        flow->src_first_seen += seconds_elapsed * CYCLES_IN_1_SEC;
                        flow->pps_count = 1;
                        return ALLOWED;
                    }
                    else
                    {
                        flow->pps_count = 0;
                        flow->curr_state = BLOCKED;
                        return BLOCKED;
                    }
                }
                return ALLOWED;
            }
            else
            {
                uint64_t seconds_elapsed = (curr_time_stamp - flow->src_first_seen) / CYCLES_IN_1SEC;
                if (seconds_elapsed > 120)
                {
                    flow->curr_state = ALLOWED;
                    flow->pps_count = 0;
                    flow->src_first_seen += seconds_elapsed * CYCLES_IN_1_SEC;
                    return ALLOWED;
                }
                return BLOCKED;
            }
        }
        int index = node->size;
        // If entry not found and we have reached capacity
        // Remove the rear element and mark it as the index for the new node    
        if (node->size == TABLE_MAX_CAPACITY)
        {
            rte_hash_reset(node->hash);
            index = node->size = 0;
        }
    
        // Add new element @packet_flows[mit_id][index]
        struct flow_data_t* flow_data = &ps->flows[id][index]; 
        *flow_data = { ALLOWED, 1, curr_time_stamp };
        node->size++;
    
        // Add the new key to hash
        rte_hash_add_key_data(node->hash, (void*)&src_ip, (void*)flow_data);    
        return ALLOWED;
    }
    /*
     * Run one iteration of the rate-limit pipeline: receive a burst from the
     * current input port, classify every packet with do_rate_limit(), then
     * hand the dropped packets to output port 2 and the allowed packets to
     * output port 0.
     *
     * pipeline - the struct pipeline_ratelimit instance, passed as void*.
     *
     * Returns the number of packets received in this burst, or 0 when there is
     * no input port or nothing was received.
     *
     * NOTE(review): `p` is used throughout but is not declared in the posted
     * snippet (presumably the rte_pipeline behind ps->p - confirm and declare
     * it before use).
     */
    static int pipeline_ratelimit_run(void* pipeline)
    {
        struct pipeline_ratelimit* ps = (struct pipeline_ratelimit*)pipeline;

        struct rte_port_in* port_in = p->port_in_next;
        struct rte_port_out* port_out = &p->ports_out[0];
        struct rte_port_out* port_drop = &p->ports_out[2];

        /* Number of rule slots; ids outside this range have no rule. */
        const uint32_t n_rules =
            sizeof(ps->flow_state_arr) / sizeof(ps->flow_state_arr[0]);

        uint8_t valid_pkt_cnt = 0, invalid_pkt_cnt = 0;
        struct rte_mbuf* valid_pkts[RTE_PORT_IN_BURST_SIZE_MAX];
        struct rte_mbuf* invalid_pkts[RTE_PORT_IN_BURST_SIZE_MAX];

        memset(valid_pkts, 0, sizeof(valid_pkts));
        memset(invalid_pkts, 0, sizeof(invalid_pkts));

        uint64_t n_pkts;

        if (unlikely(port_in == NULL)) {
            return 0;
        }

        /* Input port RX */
        n_pkts = port_in->ops.f_rx(port_in->h_port, p->pkts,
            port_in->burst_size);

        if (n_pkts == 0)
        {
            p->port_in_next = port_in->next;
            return 0;
        }

        /* Classify the whole burst before dispatching anything. */
        for (uint64_t j = 0; j < n_pkts; j++) {
            struct rte_mbuf* m = p->pkts[j];
            char* rx_pkt = rte_pktmbuf_mtod(m, char*);

            /* Rule id is read from the 4 bytes just before the packet data. */
            uint32_t id = rte_be_to_cpu_32(*(uint32_t*)(rx_pkt - sizeof(uint32_t)));
            int allow = 1;

            /* An out-of-range id is forwarded like a packet without a rule. */
            if (id < n_rules) {
                struct flow_state_t* node = &(ps->flow_state_arr[id]);

                if (node->hash && node->threshold != 0) {
                    /* Skip 14 bytes (presumably the Ethernet header). */
                    allow = do_rate_limit(ps, id, (unsigned char*)(rx_pkt + 14));
                }
            }

            if (allow)
                valid_pkts[valid_pkt_cnt++] = m;
            else
                invalid_pkts[invalid_pkt_cnt++] = m;
        }

        /* Dropped packets go to the drop port first. */
        if (invalid_pkt_cnt) {
            p->pkts_mask = 0;
            rte_memcpy(p->pkts, invalid_pkts, sizeof(invalid_pkts));
            p->pkts_mask = RTE_LEN2MASK(invalid_pkt_cnt, uint64_t);
            rte_pipeline_action_handler_port_bulk_mod(p, p->pkts_mask, port_drop);
        }

        p->pkts_mask = 0;
        memset(p->pkts, 0, sizeof(p->pkts));

        if (valid_pkt_cnt != 0)
        {
            rte_memcpy(p->pkts, valid_pkts, sizeof(valid_pkts));
            p->pkts_mask = RTE_LEN2MASK(valid_pkt_cnt, uint64_t);
        }

        rte_pipeline_action_handler_port_bulk_mod(p, p->pkts_mask, port_out);

        /* Pick candidate for next port IN to serve */
        p->port_in_next = port_in->next;
        return (int)n_pkts;
    }

結果

當閾值設為 14Mpps、60000 個源僅向一個目的地發送流量時，沒有丟包。 我們能夠從 IXIA 發送 12Mpps 並接收 12Mpps
添加 3 個或更多目的地（每個都配置為從 60000 個源接收流量）后觀察到丟包。 吞吐量僅為 8-9 Mpps。 當發送到 100 個目的地（每個 60000 src）時，只處理了 6.4Mpps。 下降了 50%。
通過 vtune-profiler 運行它時，它將 rte_hash_lookup_data 報告為熱點，並且主要是內存限制（DRAM 限制）。 我會盡快附上 vtune 報告。

Answer 1

根據內部測試的更新， rte_hash庫不會導致性能下降。 因此，正如評論中所建議的，更可能是由於當前的模式和算法設計，這可能導致緩存未命中和每個周期較少的指令。

要確定是前端停頓、后端管道停頓還是內存停頓，請使用 perf 或 vtune 。 還要盡量減少分支，並多使用 likely 和 prefetch 。

DPDK 17.11.1 - 執行基於目標的速率限制時看到的下降

問題描述

1 個解決方案

解決方案1
0 已采納 2020-09-09 02:08:28

DPDK 17.11.1 - 執行基於目標的速率限制時看到的下降

問題描述

1 個解決方案

解決方案1 0 已采納 2020-09-09 02:08:28

解決方案1
0 已采納 2020-09-09 02:08:28