From mboxrd@z Thu Jan 1 00:00:00 1970 From: Rasesh Mody Subject: [PATCH v4 29/62] net/qede/base: optimize cache-line access Date: Mon, 27 Mar 2017 23:51:59 -0700 Message-ID: <1490683952-24919-30-git-send-email-rasesh.mody@cavium.com> References: <798af029-9a26-9065-350b-48781c1d3c55@intel.com> Mime-Version: 1.0 Content-Type: text/plain Cc: Rasesh Mody To: , Return-path: Received: from mx0b-0016ce01.pphosted.com (mx0a-0016ce01.pphosted.com [67.231.148.157]) by dpdk.org (Postfix) with ESMTP id A131837B1 for ; Tue, 28 Mar 2017 08:54:56 +0200 (CEST) In-Reply-To: <798af029-9a26-9065-350b-48781c1d3c55@intel.com> List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Optimize cache-line access in ecore_chain - re-arrange fields so that fields that are needed for fastpath [mostly produce/consume and their derivatives] are in the first cache line, and the rest are in the second. This is true for both PBL and NEXT_PTR kind of chains. Advancing a page in a SINGLE_PAGE chain would still require the 2nd cacheline as well, but afaik only SPQ uses it and so it isn't considered as 'fastpath'. Signed-off-by: Rasesh Mody --- drivers/net/qede/base/ecore_chain.h | 143 ++++++++++++++++------------- drivers/net/qede/base/ecore_dev.c | 14 +-- drivers/net/qede/base/ecore_sp_commands.c | 4 +- 3 files changed, 89 insertions(+), 72 deletions(-) diff --git a/drivers/net/qede/base/ecore_chain.h b/drivers/net/qede/base/ecore_chain.h index 61e39b5..ba272a9 100644 --- a/drivers/net/qede/base/ecore_chain.h +++ b/drivers/net/qede/base/ecore_chain.h @@ -59,25 +59,6 @@ struct ecore_chain_ext_pbl { void *p_pbl_virt; }; -struct ecore_chain_pbl { - /* Base address of a pre-allocated buffer for pbl */ - dma_addr_t p_phys_table; - void *p_virt_table; - - /* Table for keeping the virtual addresses of the chain pages, - * respectively to the physical addresses in the pbl table. - */ - void **pp_virt_addr_tbl; - - /* Index to current used page by producer/consumer */ - union { - struct ecore_chain_pbl_u16 pbl16; - struct ecore_chain_pbl_u32 pbl32; - } u; - - bool external; -}; - struct ecore_chain_u16 { /* Cyclic index of next element to produce/consme */ u16 prod_idx; @@ -91,40 +72,75 @@ struct ecore_chain_u32 { }; struct ecore_chain { - /* Address of first page of the chain */ - void *p_virt_addr; - dma_addr_t p_phys_addr; - + /* fastpath portion of the chain - required for commands such + * as produce / consume. + */ /* Point to next element to produce/consume */ void *p_prod_elem; void *p_cons_elem; - enum ecore_chain_mode mode; - enum ecore_chain_use_mode intended_use; + /* Fastpath portions of the PBL [if exists] */ + + struct { + /* Table for keeping the virtual addresses of the chain pages, + * respectively to the physical addresses in the pbl table. + */ + void **pp_virt_addr_tbl; + + union { + struct ecore_chain_pbl_u16 u16; + struct ecore_chain_pbl_u32 u32; + } c; + } pbl; - enum ecore_chain_cnt_type cnt_type; union { struct ecore_chain_u16 chain16; struct ecore_chain_u32 chain32; } u; - u32 page_cnt; + /* Capacity counts only usable elements */ + u32 capacity; + u32 page_cnt; - /* Number of elements - capacity is for usable elements only, - * while size will contain total number of elements [for entire chain]. + /* A u8 would suffice for mode, but it would save as a lot of headaches + * on castings & defaults. */ - u32 capacity; - u32 size; + enum ecore_chain_mode mode; /* Elements information for fast calculations */ u16 elem_per_page; u16 elem_per_page_mask; - u16 elem_unusable; - u16 usable_per_page; u16 elem_size; u16 next_page_mask; + u16 usable_per_page; + u8 elem_unusable; - struct ecore_chain_pbl pbl; + u8 cnt_type; + + /* Slowpath of the chain - required for initialization and destruction, + * but isn't involved in regular functionality. + */ + + /* Base address of a pre-allocated buffer for pbl */ + struct { + dma_addr_t p_phys_table; + void *p_virt_table; + } pbl_sp; + + /* Address of first page of the chain - the address is required + * for fastpath operation [consume/produce] but only for the the SINGLE + * flavour which isn't considered fastpath [== SPQ]. + */ + void *p_virt_addr; + dma_addr_t p_phys_addr; + + /* Total number of elements [for entire chain] */ + u32 size; + + u8 intended_use; + + /* TBD - do we really need this? Couldn't find usage for it */ + bool b_external_pbl; void *dp_ctx; }; @@ -135,8 +151,8 @@ struct ecore_chain { #define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode) \ ((mode == ECORE_CHAIN_MODE_NEXT_PTR) ? \ - (1 + ((sizeof(struct ecore_chain_next) - 1) / \ - (elem_size))) : 0) + (u8)(1 + ((sizeof(struct ecore_chain_next) - 1) / \ + (elem_size))) : 0) #define USABLE_ELEMS_PER_PAGE(elem_size, mode) \ ((u32)(ELEMS_PER_PAGE(elem_size) - \ @@ -245,7 +261,7 @@ u16 ecore_chain_get_usable_per_page(struct ecore_chain *p_chain) } static OSAL_INLINE -u16 ecore_chain_get_unusable_per_page(struct ecore_chain *p_chain) +u8 ecore_chain_get_unusable_per_page(struct ecore_chain *p_chain) { return p_chain->elem_unusable; } @@ -263,7 +279,7 @@ static OSAL_INLINE u32 ecore_chain_get_page_cnt(struct ecore_chain *p_chain) static OSAL_INLINE dma_addr_t ecore_chain_get_pbl_phys(struct ecore_chain *p_chain) { - return p_chain->pbl.p_phys_table; + return p_chain->pbl_sp.p_phys_table; } /** @@ -288,9 +304,9 @@ ecore_chain_advance_page(struct ecore_chain *p_chain, void **p_next_elem, p_next = (struct ecore_chain_next *)(*p_next_elem); *p_next_elem = p_next->next_virt; if (is_chain_u16(p_chain)) - *(u16 *)idx_to_inc += p_chain->elem_unusable; + *(u16 *)idx_to_inc += (u16)p_chain->elem_unusable; else - *(u32 *)idx_to_inc += p_chain->elem_unusable; + *(u32 *)idx_to_inc += (u16)p_chain->elem_unusable; break; case ECORE_CHAIN_MODE_SINGLE: *p_next_elem = p_chain->p_virt_addr; @@ -391,7 +407,7 @@ static OSAL_INLINE void *ecore_chain_produce(struct ecore_chain *p_chain) if ((p_chain->u.chain16.prod_idx & p_chain->elem_per_page_mask) == p_chain->next_page_mask) { p_prod_idx = &p_chain->u.chain16.prod_idx; - p_prod_page_idx = &p_chain->pbl.u.pbl16.prod_page_idx; + p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx; ecore_chain_advance_page(p_chain, &p_chain->p_prod_elem, p_prod_idx, p_prod_page_idx); } @@ -400,7 +416,7 @@ static OSAL_INLINE void *ecore_chain_produce(struct ecore_chain *p_chain) if ((p_chain->u.chain32.prod_idx & p_chain->elem_per_page_mask) == p_chain->next_page_mask) { p_prod_idx = &p_chain->u.chain32.prod_idx; - p_prod_page_idx = &p_chain->pbl.u.pbl32.prod_page_idx; + p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx; ecore_chain_advance_page(p_chain, &p_chain->p_prod_elem, p_prod_idx, p_prod_page_idx); } @@ -465,7 +481,7 @@ static OSAL_INLINE void *ecore_chain_consume(struct ecore_chain *p_chain) if ((p_chain->u.chain16.cons_idx & p_chain->elem_per_page_mask) == p_chain->next_page_mask) { p_cons_idx = &p_chain->u.chain16.cons_idx; - p_cons_page_idx = &p_chain->pbl.u.pbl16.cons_page_idx; + p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx; ecore_chain_advance_page(p_chain, &p_chain->p_cons_elem, p_cons_idx, p_cons_page_idx); } @@ -474,7 +490,7 @@ static OSAL_INLINE void *ecore_chain_consume(struct ecore_chain *p_chain) if ((p_chain->u.chain32.cons_idx & p_chain->elem_per_page_mask) == p_chain->next_page_mask) { p_cons_idx = &p_chain->u.chain32.cons_idx; - p_cons_page_idx = &p_chain->pbl.u.pbl32.cons_page_idx; + p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx; ecore_chain_advance_page(p_chain, &p_chain->p_cons_elem, p_cons_idx, p_cons_page_idx); } @@ -518,25 +534,26 @@ static OSAL_INLINE void ecore_chain_reset(struct ecore_chain *p_chain) u32 reset_val = p_chain->page_cnt - 1; if (is_chain_u16(p_chain)) { - p_chain->pbl.u.pbl16.prod_page_idx = (u16)reset_val; - p_chain->pbl.u.pbl16.cons_page_idx = (u16)reset_val; + p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val; + p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val; } else { - p_chain->pbl.u.pbl32.prod_page_idx = reset_val; - p_chain->pbl.u.pbl32.cons_page_idx = reset_val; + p_chain->pbl.c.u32.prod_page_idx = reset_val; + p_chain->pbl.c.u32.cons_page_idx = reset_val; } } switch (p_chain->intended_use) { - case ECORE_CHAIN_USE_TO_CONSUME_PRODUCE: - case ECORE_CHAIN_USE_TO_PRODUCE: - /* Do nothing */ - break; - case ECORE_CHAIN_USE_TO_CONSUME: - /* produce empty elements */ - for (i = 0; i < p_chain->capacity; i++) + /* produce empty elements */ + for (i = 0; i < p_chain->capacity; i++) ecore_chain_recycle_consumed(p_chain); - break; + break; + + case ECORE_CHAIN_USE_TO_CONSUME_PRODUCE: + case ECORE_CHAIN_USE_TO_PRODUCE: + default: + /* Do nothing */ + break; } } @@ -563,9 +580,9 @@ ecore_chain_init_params(struct ecore_chain *p_chain, u32 page_cnt, u8 elem_size, p_chain->p_virt_addr = OSAL_NULL; p_chain->p_phys_addr = 0; p_chain->elem_size = elem_size; - p_chain->intended_use = intended_use; + p_chain->intended_use = (u8)intended_use; p_chain->mode = mode; - p_chain->cnt_type = cnt_type; + p_chain->cnt_type = (u8)cnt_type; p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size); p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode); @@ -577,9 +594,9 @@ ecore_chain_init_params(struct ecore_chain *p_chain, u32 page_cnt, u8 elem_size, p_chain->page_cnt = page_cnt; p_chain->capacity = p_chain->usable_per_page * page_cnt; p_chain->size = p_chain->elem_per_page * page_cnt; - p_chain->pbl.external = false; - p_chain->pbl.p_phys_table = 0; - p_chain->pbl.p_virt_table = OSAL_NULL; + p_chain->b_external_pbl = false; + p_chain->pbl_sp.p_phys_table = 0; + p_chain->pbl_sp.p_virt_table = OSAL_NULL; p_chain->pbl.pp_virt_addr_tbl = OSAL_NULL; p_chain->dp_ctx = dp_ctx; @@ -623,8 +640,8 @@ static OSAL_INLINE void ecore_chain_init_pbl_mem(struct ecore_chain *p_chain, dma_addr_t p_phys_pbl, void **pp_virt_addr_tbl) { - p_chain->pbl.p_phys_table = p_phys_pbl; - p_chain->pbl.p_virt_table = p_virt_pbl; + p_chain->pbl_sp.p_phys_table = p_phys_pbl; + p_chain->pbl_sp.p_virt_table = p_virt_pbl; p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl; } diff --git a/drivers/net/qede/base/ecore_dev.c b/drivers/net/qede/base/ecore_dev.c index c895656..1c08d4a 100644 --- a/drivers/net/qede/base/ecore_dev.c +++ b/drivers/net/qede/base/ecore_dev.c @@ -3559,13 +3559,13 @@ static void ecore_chain_free_pbl(struct ecore_dev *p_dev, struct ecore_chain *p_chain) { void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl; - u8 *p_pbl_virt = (u8 *)p_chain->pbl.p_virt_table; + u8 *p_pbl_virt = (u8 *)p_chain->pbl_sp.p_virt_table; u32 page_cnt = p_chain->page_cnt, i, pbl_size; if (!pp_virt_addr_tbl) return; - if (!p_chain->pbl.p_virt_table) + if (!p_pbl_virt) goto out; for (i = 0; i < page_cnt; i++) { @@ -3581,10 +3581,10 @@ static void ecore_chain_free_pbl(struct ecore_dev *p_dev, pbl_size = page_cnt * ECORE_CHAIN_PBL_ENTRY_SIZE; - if (!p_chain->pbl.external) - OSAL_DMA_FREE_COHERENT(p_dev, p_chain->pbl.p_virt_table, - p_chain->pbl.p_phys_table, pbl_size); -out: + if (!p_chain->b_external_pbl) + OSAL_DMA_FREE_COHERENT(p_dev, p_chain->pbl_sp.p_virt_table, + p_chain->pbl_sp.p_phys_table, pbl_size); + out: OSAL_VFREE(p_dev, p_chain->pbl.pp_virt_addr_tbl); } @@ -3716,7 +3716,7 @@ ecore_chain_alloc_pbl(struct ecore_dev *p_dev, } else { p_pbl_virt = ext_pbl->p_pbl_virt; p_pbl_phys = ext_pbl->p_pbl_phys; - p_chain->pbl.external = true; + p_chain->b_external_pbl = true; } ecore_chain_init_pbl_mem(p_chain, p_pbl_virt, p_pbl_phys, diff --git a/drivers/net/qede/base/ecore_sp_commands.c b/drivers/net/qede/base/ecore_sp_commands.c index 23ebab7..b831970 100644 --- a/drivers/net/qede/base/ecore_sp_commands.c +++ b/drivers/net/qede/base/ecore_sp_commands.c @@ -379,11 +379,11 @@ enum _ecore_status_t ecore_sp_pf_start(struct ecore_hwfn *p_hwfn, /* Place EQ address in RAMROD */ DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr, - p_hwfn->p_eq->chain.pbl.p_phys_table); + p_hwfn->p_eq->chain.pbl_sp.p_phys_table); page_cnt = (u8)ecore_chain_get_page_cnt(&p_hwfn->p_eq->chain); p_ramrod->event_ring_num_pages = page_cnt; DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr, - p_hwfn->p_consq->chain.pbl.p_phys_table); + p_hwfn->p_consq->chain.pbl_sp.p_phys_table); ecore_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config); -- 1.7.10.3