diff options
Diffstat (limited to 'fpga/usrp3/lib/rfnoc/crossbar/chdr_crossbar_nxn.v')
| -rw-r--r-- | fpga/usrp3/lib/rfnoc/crossbar/chdr_crossbar_nxn.v | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/fpga/usrp3/lib/rfnoc/crossbar/chdr_crossbar_nxn.v b/fpga/usrp3/lib/rfnoc/crossbar/chdr_crossbar_nxn.v new file mode 100644 index 000000000..79f1a6626 --- /dev/null +++ b/fpga/usrp3/lib/rfnoc/crossbar/chdr_crossbar_nxn.v @@ -0,0 +1,381 @@ +// +// Copyright 2018 Ettus Research, A National Instruments Company +// +// SPDX-License-Identifier: LGPL-3.0-or-later +// +// Module: chdr_crossbar_nxn +// Description: +// This module implements a full-bandwidth NxN crossbar with N input and output ports +// for CHDR traffic. It supports multiple optimization strategies for performance, +// area and timing tradeoffs. It uses AXI-Stream for all of its links. The crossbar +// has a dynamic routing table based on a Content Addressable Memory (CAM). The SID +// is used to determine the destination of a packet and the routing table contains +// a re-programmable SID to crossbar port mapping. The table is programmed using +// special route config packets on the data input ports or using an optional +// management port. +// The topology, routing algorithms and the router architecture is +// described in README.md in this directory. +// Parameters: +// - CHDR_W: Width of the AXI-Stream data bus +// - NPORTS: Number of ports to instantiate +// - DEFAULT_PORT: The failsafe port to forward a packet to is SID mapping is missing +// - MTU: log2 of max packet size (in words) +// - ROUTE_TBL_SIZE: log2 of the number of mappings that the routing table can hold +// at any time. Mapping values are maintained in a FIFO fashion. +// - MUX_ALLOC: Algorithm to allocate the egress MUX +// * PRIO: Priority based. Lower port numbers have a higher priority +// * ROUND-ROBIN: Round robin input port allocation +// - OPTIMIZE: Optimization strategy for performance vs area vs timing tradeoffs +// * AREA: Attempt to minimize area at the cost of performance (throughput) and/or timing +// * PERFORMANCE: Attempt to maximize performance at the cost of area and/or timing +// * TIMING: Attempt to maximize Fmax at the cost of area and/or performance +// - NPORTS_MGMT: Number of ports with management endpoint. The first NPORTS_MGMT ports will +// have the management port instantiated +// - EXT_RTCFG_PORT: Enable a side-channel AXI-Stream management port to configure the +// routing table +// Signals: +// - s_axis_*: Slave port for router (flattened) +// - m_axis_*: Master port for router (flattened) +// - s_axis_mgmt_*: Management slave port +// - device_id: The ID of the device that has instantiated this module +// + +module chdr_crossbar_nxn #( + parameter [15:0] PROTOVER = {8'd1, 8'd0}, + parameter CHDR_W = 64, + parameter [7:0] NPORTS = 8, + parameter [7:0] DEFAULT_PORT = 0, + parameter MTU = 9, + parameter ROUTE_TBL_SIZE = 6, + parameter MUX_ALLOC = "ROUND-ROBIN", + parameter OPTIMIZE = "AREA", + parameter [7:0] NPORTS_MGMT = NPORTS, + parameter [0:0] EXT_RTCFG_PORT = 0 +) ( + input wire clk, + input wire reset, + // Device info + input wire [15:0] device_id, + // Inputs + input wire [(CHDR_W*NPORTS)-1:0] s_axis_tdata, + input wire [NPORTS-1:0] s_axis_tlast, + input wire [NPORTS-1:0] s_axis_tvalid, + output wire [NPORTS-1:0] s_axis_tready, + // Output + output wire [(CHDR_W*NPORTS)-1:0] m_axis_tdata, + output wire [NPORTS-1:0] m_axis_tlast, + output wire [NPORTS-1:0] m_axis_tvalid, + input wire [NPORTS-1:0] m_axis_tready, + // Router config management port + input wire ext_rtcfg_stb, + input wire [15:0] ext_rtcfg_addr, + input wire [31:0] ext_rtcfg_data, + output wire ext_rtcfg_ack +); + // --------------------------------------------------- + // RFNoC Includes + // --------------------------------------------------- + `include "../core/rfnoc_chdr_utils.vh" + `include "../core/rfnoc_chdr_internal_utils.vh" + + localparam NPORTS_W = $clog2(NPORTS); + localparam EPID_W = 16; + localparam [17:0] EXT_INFO = {1'b0, EXT_RTCFG_PORT, NPORTS_MGMT, NPORTS}; + + localparam [0:0] PKT_ST_HEAD = 1'b0; + localparam [0:0] PKT_ST_BODY = 1'b1; + + // The compute_mux_alloc function is the switch allocation function for the MUX + // i.e. it chooses which input port reserves the output MUX for packet transfer. + function [NPORTS_W-1:0] compute_mux_alloc; + input [NPORTS-1:0] pkt_waiting; + input [NPORTS_W-1:0] last_alloc; + reg signed [NPORTS_W:0] i; + begin + compute_mux_alloc = last_alloc; + for (i = NPORTS-1; i >= 0; i=i-1) begin + if (MUX_ALLOC == "PRIO") begin + // Priority. Lower port index gets a higher priority. + if (pkt_waiting[i]) + compute_mux_alloc = i; + end else begin + // Round-robin + if (pkt_waiting[(last_alloc + i + 1) % NPORTS]) + compute_mux_alloc = (last_alloc + i + 1) % NPORTS; + end + end + end + endfunction + + wire [NPORTS-1:0] rtcfg_req_wr; + wire [(16*NPORTS)-1:0] rtcfg_req_addr; + wire [(32*NPORTS)-1:0] rtcfg_req_data; + wire [NPORTS-1:0] rtcfg_resp_ack; + wire [(EPID_W*NPORTS)-1:0] find_tdata; + wire [NPORTS-1:0] find_tvalid; + wire [NPORTS-1:0] find_tready; + wire [(NPORTS_W*NPORTS)-1:0] result_tdata; + wire [NPORTS-1:0] result_tkeep; + wire [NPORTS-1:0] result_tvalid; + wire [NPORTS-1:0] result_tready; + + // Instantiate a single CAM-based routing table that will be shared between all + // input ports. Configuration and lookup is performed using an AXI-Stream iface. + // If multiple packets arrive simultaneously, only the headers of those packets will + // be serialized in order to arbitrate this map. Selection is done round-robin. + chdr_xb_routing_table #( + .SIZE(ROUTE_TBL_SIZE), .NPORTS(NPORTS), + .EXT_INS_PORT_EN(EXT_RTCFG_PORT) + ) routing_tbl_i ( + .clk (clk ), + .reset (reset ), + .port_req_wr (rtcfg_req_wr ), + .port_req_addr (rtcfg_req_addr), + .port_req_data (rtcfg_req_data), + .port_resp_ack (rtcfg_resp_ack), + .ext_req_wr (ext_rtcfg_stb ), + .ext_req_addr (ext_rtcfg_addr), + .ext_req_data (ext_rtcfg_data), + .ext_resp_ack (ext_rtcfg_ack ), + .axis_find_tdata (find_tdata ), + .axis_find_tvalid (find_tvalid ), + .axis_find_tready (find_tready ), + .axis_result_tdata (result_tdata ), + .axis_result_tkeep (result_tkeep ), + .axis_result_tvalid(result_tvalid ), + .axis_result_tready(result_tready ) + ); + + wire [CHDR_W-1:0] i_tdata [0:NPORTS-1]; + wire [9:0] i_tdest [0:NPORTS-1]; + wire [1:0] i_tid [0:NPORTS-1]; + wire i_tlast [0:NPORTS-1]; + wire i_tvalid [0:NPORTS-1]; + wire i_tready [0:NPORTS-1]; + wire [CHDR_W-1:0] buf_tdata [0:NPORTS-1]; + wire [NPORTS_W-1:0] buf_tdest [0:NPORTS-1], buf_tdest_tmp[0:NPORTS-1]; + wire buf_tkeep [0:NPORTS-1]; + wire buf_tlast [0:NPORTS-1]; + wire buf_tvalid[0:NPORTS-1]; + wire buf_tready[0:NPORTS-1]; + wire [CHDR_W-1:0] swi_tdata [0:NPORTS-1]; + wire [NPORTS_W-1:0] swi_tdest [0:NPORTS-1]; + wire swi_tlast [0:NPORTS-1]; + wire swi_tvalid[0:NPORTS-1]; + wire swi_tready[0:NPORTS-1]; + wire [(CHDR_W*NPORTS)-1:0] swo_tdata [0:NPORTS-1], muxi_tdata [0:NPORTS-1]; + wire [NPORTS-1:0] swo_tlast [0:NPORTS-1], muxi_tlast [0:NPORTS-1]; + wire [NPORTS-1:0] swo_tvalid[0:NPORTS-1], muxi_tvalid[0:NPORTS-1]; + wire [NPORTS-1:0] swo_tready[0:NPORTS-1], muxi_tready[0:NPORTS-1]; + + genvar n, i, j; + generate + for (n = 0; n < NPORTS; n = n + 1) begin: i_ports + // For each input port, first check if we have a management packet + // arriving. If it arrives, the top config commands are extrated, sent to the + // routing table for configuration, and the rest of the packet is forwarded + // down to the router. + // the router. + if (n < NPORTS_MGMT) begin + chdr_mgmt_pkt_handler #( + .PROTOVER(PROTOVER), .CHDR_W(CHDR_W), .MGMT_ONLY(0) + ) mgmt_ep_i ( + .clk (clk ), + .rst (reset ), + .node_info (chdr_mgmt_build_node_info(EXT_INFO, n, NODE_TYPE_XBAR, device_id)), + .s_axis_chdr_tdata (s_axis_tdata [(n*CHDR_W)+:CHDR_W] ), + .s_axis_chdr_tlast (s_axis_tlast [n] ), + .s_axis_chdr_tvalid (s_axis_tvalid[n] ), + .s_axis_chdr_tready (s_axis_tready[n] ), + .s_axis_chdr_tuser ('d0 ), + .m_axis_chdr_tdata (i_tdata [n] ), + .m_axis_chdr_tdest (i_tdest [n] ), + .m_axis_chdr_tid (i_tid [n] ), + .m_axis_chdr_tlast (i_tlast [n] ), + .m_axis_chdr_tvalid (i_tvalid [n] ), + .m_axis_chdr_tready (i_tready [n] ), + .ctrlport_req_wr (rtcfg_req_wr [n] ), + .ctrlport_req_rd (/* unused */ ), + .ctrlport_req_addr (rtcfg_req_addr[(n*16)+:16] ), + .ctrlport_req_data (rtcfg_req_data[(n*32)+:32] ), + .ctrlport_resp_ack (rtcfg_resp_ack[n] ), + .ctrlport_resp_data (32'h0 /* unused */ ), + .op_stb (/* unused */ ), + .op_dst_epid (/* unused */ ), + .op_src_epid (/* unused */ ), + .op_data (/* unused */ ) + ); + end else begin + assign i_tdata [n] = s_axis_tdata [(n*CHDR_W)+:CHDR_W]; + assign i_tid [n] = CHDR_MGMT_ROUTE_EPID; + assign i_tdest [n] = 10'd0; // Unused + assign i_tlast [n] = s_axis_tlast [n]; + assign i_tvalid [n] = s_axis_tvalid[n]; + assign s_axis_tready[n] = i_tready [n]; + + assign rtcfg_req_wr [n] = 1'b0; + assign rtcfg_req_addr[(n*16)+:16] = 16'h0; + assign rtcfg_req_data[(n*32)+:32] = 32'h0; + end + + // Ingress buffer module that does the following: + // - Stores and gates an incoming packet + // - Looks up destination in routing table and attaches a tdest for the packet + chdr_xb_ingress_buff #( + .WIDTH(CHDR_W), .MTU(MTU), .DEST_W(NPORTS_W), .NODE_ID(n) + ) buf_i ( + .clk (clk ), + .reset (reset ), + .s_axis_chdr_tdata (i_tdata [n] ), + .s_axis_chdr_tdest (i_tdest [n][NPORTS_W-1:0] ), + .s_axis_chdr_tid (i_tid [n] ), + .s_axis_chdr_tlast (i_tlast [n] ), + .s_axis_chdr_tvalid (i_tvalid [n] ), + .s_axis_chdr_tready (i_tready [n] ), + .m_axis_chdr_tdata (buf_tdata [n] ), + .m_axis_chdr_tdest (buf_tdest_tmp[n] ), + .m_axis_chdr_tkeep (buf_tkeep [n] ), + .m_axis_chdr_tlast (buf_tlast [n] ), + .m_axis_chdr_tvalid (buf_tvalid [n] ), + .m_axis_chdr_tready (buf_tready [n] ), + .m_axis_find_tdata (find_tdata [(n*EPID_W)+:EPID_W] ), + .m_axis_find_tvalid (find_tvalid [n] ), + .m_axis_find_tready (find_tready [n] ), + .s_axis_result_tdata (result_tdata [(n*NPORTS_W)+:NPORTS_W]), + .s_axis_result_tkeep (result_tkeep [n] ), + .s_axis_result_tvalid(result_tvalid[n] ), + .s_axis_result_tready(result_tready[n] ) + ); + assign buf_tdest[n] = buf_tkeep[n] ? buf_tdest_tmp[n] : DEFAULT_PORT[NPORTS_W-1:0]; + + // Pipeline state + axi_fifo #( + .WIDTH(CHDR_W+1+NPORTS_W), .SIZE(1) + ) pipe_i ( + .clk (clk ), + .reset (reset ), + .clear (1'b0 ), + .i_tdata ({buf_tlast[n], buf_tdest[n], buf_tdata[n]}), + .i_tvalid (buf_tvalid[n] ), + .i_tready (buf_tready[n] ), + .o_tdata ({swi_tlast[n], swi_tdest[n], swi_tdata[n]}), + .o_tvalid (swi_tvalid[n] ), + .o_tready (swi_tready[n] ), + .space (/* Unused */ ), + .occupied (/* Unused */ ) + ); + + // Ingress demux. Use the tdest field to determine packet destination + axis_switch #( + .DATA_W(CHDR_W), .DEST_W(1), .IN_PORTS(1), .OUT_PORTS(NPORTS), .PIPELINE(1) + ) demux_i ( + .clk (clk ), + .reset (reset ), + .s_axis_tdata (swi_tdata [n] ), + .s_axis_tdest ({1'b0, swi_tdest [n]}), + .s_axis_tlast (swi_tlast [n] ), + .s_axis_tvalid (swi_tvalid[n] ), + .s_axis_tready (swi_tready[n] ), + .s_axis_alloc (1'b0 ), + .m_axis_tdata (swo_tdata [n] ), + .m_axis_tdest (/* Unused */ ), + .m_axis_tlast (swo_tlast [n] ), + .m_axis_tvalid (swo_tvalid[n] ), + .m_axis_tready (swo_tready[n] ) + ); + end + + for (i = 0; i < NPORTS; i = i + 1) begin + for (j = 0; j < NPORTS; j = j + 1) begin + assign muxi_tdata [i][j*CHDR_W+:CHDR_W] = swo_tdata [j][i*CHDR_W+:CHDR_W]; + assign muxi_tlast [i][j] = swo_tlast [j][i]; + assign muxi_tvalid[i][j] = swo_tvalid [j][i]; + assign swo_tready [i][j] = muxi_tready[j][i]; + end + end + + for (n = 0; n < NPORTS; n = n + 1) begin: o_ports + if (OPTIMIZE == "PERFORMANCE") begin + // Use the axis_switch module when optimizing for performance + // This logic has some extra levels of logic to ensure + // that the switch allocation happens in 0 clock cycles which + // means that Fmax for this implementation will be lower. + + wire mux_ready = |muxi_tready[n]; // Max 1 bit should be high + wire mux_valid = |muxi_tvalid[n]; + wire mux_last = |(muxi_tvalid[n] & muxi_tlast[n]); + + // Track the input packet state + reg [0:0] pkt_state = PKT_ST_HEAD; + always @(posedge clk) begin + if (reset) begin + pkt_state <= PKT_ST_HEAD; + end else if (mux_valid & mux_ready) begin + pkt_state <= mux_last ? PKT_ST_HEAD : PKT_ST_BODY; + end + end + + // The switch requires the allocation to stay valid until the + // end of the packet. We also might need to keep the previous + // packet's allocation to compute the current one + reg [NPORTS_W-1:0] prev_sw_alloc = {NPORTS_W{1'b0}}; + reg [NPORTS_W-1:0] pkt_sw_alloc = {NPORTS_W{1'b0}}; + wire [NPORTS_W-1:0] muxi_sw_alloc = (mux_valid && pkt_state == PKT_ST_HEAD) ? + compute_mux_alloc(muxi_tvalid[n], prev_sw_alloc) : pkt_sw_alloc; + + always @(posedge clk) begin + if (reset) begin + prev_sw_alloc <= {NPORTS_W{1'b0}}; + pkt_sw_alloc <= {NPORTS_W{1'b0}}; + end else if (mux_valid & mux_ready) begin + if (pkt_state == PKT_ST_HEAD) + pkt_sw_alloc <= muxi_sw_alloc; + if (mux_last) + prev_sw_alloc <= muxi_sw_alloc; + end + end + + axis_switch #( + .DATA_W(CHDR_W), .DEST_W(1), .IN_PORTS(NPORTS), .OUT_PORTS(1), + .PIPELINE(0) + ) mux_i ( + .clk (clk ), + .reset (reset ), + .s_axis_tdata (muxi_tdata [n] ), + .s_axis_tdest ({NPORTS{1'b0}} /* Unused */ ), + .s_axis_tlast (muxi_tlast [n] ), + .s_axis_tvalid (muxi_tvalid[n] ), + .s_axis_tready (muxi_tready[n] ), + .s_axis_alloc (muxi_sw_alloc ), + .m_axis_tdata (m_axis_tdata [(n*CHDR_W)+:CHDR_W]), + .m_axis_tdest (/* Unused */ ), + .m_axis_tlast (m_axis_tlast [n] ), + .m_axis_tvalid (m_axis_tvalid[n] ), + .m_axis_tready (m_axis_tready[n] ) + ); + end else begin + // axi_mux has an additional bubble cycle but the logic + // to allocate an input port has fewer levels and takes + // up fewer resources. + axi_mux #( + .PRIO(MUX_ALLOC == "PRIO"), .WIDTH(CHDR_W), .SIZE(NPORTS), + .PRE_FIFO_SIZE(OPTIMIZE == "TIMING" ? 1 : 0), .POST_FIFO_SIZE(1) + ) mux_i ( + .clk (clk ), + .reset (reset ), + .clear (1'b0 ), + .i_tdata (muxi_tdata [n] ), + .i_tlast (muxi_tlast [n] ), + .i_tvalid (muxi_tvalid [n] ), + .i_tready (muxi_tready [n] ), + .o_tdata (m_axis_tdata [(n*CHDR_W)+:CHDR_W]), + .o_tlast (m_axis_tlast [n] ), + .o_tvalid (m_axis_tvalid[n] ), + .o_tready (m_axis_tready[n] ) + ); + end + end + endgenerate + + +endmodule |
