//# A Parallel Binary Multiplier
// A generic signed multiplier module, with the inference left to the CAD
// tool. Any attributes to control the synthesis of the multiplier should be
// applied, in the text of the enclosing module, to this whole module. The
// attributes vary too much and none provide the default automatic inference
// choices made by the CAD tool (e.g.: logic for narrow widths, DSP blocks for
// larger widths), which are almost always the right choices.
//## Width and Signedness
// Both input word widths are parameterized separately so as to infer the
// smallest multiplier necessary to generate a full product, whose width is
// the sum of the widths of the inputs. *The user must manually supply that
// total width to the connecting wires in the enclosing module, as there is no
// way to introspect parameter values inside a module.*
// The multiplier inference is limited to signed integers as the common case
// and to keep the code simple. If you must treat the inputs as unsigned
// integers, first extend them with a constant zero most-significant bit to
// force them positive. Yes, this may cost some area and may slow down the
// logic slightly, but if a single extra bit of width breaks your timing
// closure, then your design's timing closure was already on its last legs,
// and perhaps you need to allow more pipelining and/or use a narrower width.
//## Pipelining
// At the time of writing (March 2020), retiming of external registers into an
// inferred multiplier does not work in Vivado, so we cannot simply place
// a [Register Pipeline](./Register_Pipeline.html) before and after the
// multiplier module to help it meet timing if necessary.
// Instead, we must connect the input and output pipelines and the multiplier
// all together in a single clocked always block to match the recommended HDL
// style for multiplier inference (UG901, *Vivado Design Suite User Guide:
// Synthesis*). This code also works under Intel Quartus Prime as its HDL
// coding guidelines for inferring multipliers are the same (UG-20131, *Intel
// Quartus Prime Pro Edition User Guide: Design Recommendations*).
// **Note**: *you must disable shift register extraction during synthesis to
// force implementation of the pipelines as registers which can be retimed and
// can be placed-and-routed independently.* Else your pipeline may be
// implemented as shift registers which, while compact, won't provide any
// pipelining benefits. *Shift register extraction is enabled by default in
// Vivado and Quartus synthesis.*
// Pipelining multipliers, although optional, benefits both synthesis and
// place-and-route:
// * At a minimum, single input and output pipeline registers will get packed
// into the input and output registers of the DSP blocks, easing timing and
// saving area.
// * Particularly for wider multipliers, the inferred adder tree which merges
// the partial products from multiple DSP blocks may need an output pipeline
// to meet timing. (The input pipeline does not appear to retime through the
// DSP blocks.)
// * Finally, since DSP blocks only exist in certain fixed locations on FPGAs,
// abundant pipelining frees the CAD tool to place-and-route the DSP blocks
// away from other timing-critical logic, which would otherwise misleadingly
// appear to be the critical path (due to poor routing) and lead you to waste
// effort trying to optimize the wrong logic!
// For example, I've once had to add a total of 24 pipeline stages (12 input,
// 12 output) to a very wide multiplier with a 100-bit product to allow other
// 100-bit arithmetic logic to meet timing.
`default_nettype none
// Signed parallel multiplier: `product_out` is `A_in * B_in`, optionally
// preceded by INPUT_PIPE_DEPTH register stages on both operands and/or
// followed by OUTPUT_PIPE_DEPTH register stages on the product.
// Latency from inputs to `product_out`, as written below:
// * both depths zero: combinational (no clock edges)
// * input pipeline only: INPUT_PIPE_DEPTH clock edges
// * output pipeline only: OUTPUT_PIPE_DEPTH clock edges
// * both pipelines: INPUT_PIPE_DEPTH + OUTPUT_PIPE_DEPTH clock edges
module Multiplier_Binary_Parallel
#(
// Bit widths of the `A_in` and `B_in` operands.
// NOTE(review): the default of 0 yields a `[-1:0]` port range; presumably
// a placeholder that is always meant to be overridden -- confirm.
parameter WORD_WIDTH_A = 0,
parameter WORD_WIDTH_B = 0,
// Number of register stages before (INPUT) and after (OUTPUT) the
// multiplication. Zero means no pipeline on that side.
parameter INPUT_PIPE_DEPTH = 0,
parameter OUTPUT_PIPE_DEPTH = 0,
// Don't set at instantiation, except in IPI
parameter PRODUCT_WIDTH = WORD_WIDTH_A + WORD_WIDTH_B
)
(
// Unused if no input/output pipelines (combinational multiplier)
// verilator lint_off UNUSED
input wire clock,
// verilator lint_on UNUSED
// Signed operands and signed product. `product_out` is a `reg` because it
// is assigned procedurally (in `always @(*)` blocks), not because it is
// itself a register.
input wire signed [WORD_WIDTH_A-1:0] A_in,
input wire signed [WORD_WIDTH_B-1:0] B_in,
output reg signed [PRODUCT_WIDTH-1:0] product_out
);
// All-zero constants of each width, used only to initialize the pipelines
// and the output.
localparam WORD_ZERO_A = {WORD_WIDTH_A{1'b0}};
localparam WORD_ZERO_B = {WORD_WIDTH_B{1'b0}};
localparam PRODUCT_ZERO = {PRODUCT_WIDTH{1'b0}};
// Start with a zero product. In every generated case below, an
// `always @(*)` block subsequently drives `product_out`.
initial begin
product_out = PRODUCT_ZERO;
end
// If their depth is greater than zero, create the input and/or output
// pipelines. These **MUST** be declared as `signed`, else the multiplication
// will be inferred as unsigned and calculate the wrong results when given
// negative integers.
// Then, if necessary, we connect the inputs, the pipelines, and the
// multiplier all together in a single clocked always block. The CAD tool will
// infer DSP blocks and adders, and retime the pipeline registers as necessary
// if retiming is enabled in your CAD tool. *Retiming is off by default in
// Vivado and Quartus synthesis.*
// We write the pipelines using the idiom of peeling out the first loop
// iteration so we never generate a negative index with `i-1`. Note the
// initialization value of `i` in the for-loops.
// There are four possible cases of zero and non-zero (positive) pipeline
// depths, so we generate the correct code, based on the HDL guidelines, for
// each case. A negative pipe depth would result in negative array ranges,
// *which is legal in Verilog-2001*, though I have no idea what it's for. To
// avoid strange corner cases, no code is generated for negative values, which
// will cause the design to fail to elaborate.
// NOTE(review): none of the four conditions below matches a negative depth
// and there is no final `else`, so in that case `product_out` is only ever
// assigned by the `initial` block above. Whether a given tool reports that as
// an elaboration failure is not established by this code -- confirm with
// your toolchain.
generate
// No pipelines (combinational multiplier)
if ((INPUT_PIPE_DEPTH == 0) && (OUTPUT_PIPE_DEPTH == 0)) begin: no_pipe
always @(*) begin
product_out = A_in * B_in;
end
end
// Input pipeline only
else if ((INPUT_PIPE_DEPTH > 0) && (OUTPUT_PIPE_DEPTH == 0)) begin: in_pipe
// One array of INPUT_PIPE_DEPTH stages per operand; index 0 is the stage
// fed by the module inputs, index INPUT_PIPE_DEPTH-1 feeds the multiply.
reg signed [WORD_WIDTH_A-1:0] input_pipe_A [INPUT_PIPE_DEPTH-1:0];
reg signed [WORD_WIDTH_B-1:0] input_pipe_B [INPUT_PIPE_DEPTH-1:0];
// Loop index, used by both the `initial` and the clocked block.
integer i;
// Zero every pipeline stage initially.
initial begin
for (i=0; i < INPUT_PIPE_DEPTH; i=i+1) begin
input_pipe_A [i] = WORD_ZERO_A;
input_pipe_B [i] = WORD_ZERO_B;
end
end
always @(posedge clock) begin
// Peeled first iteration: stage 0 loads from the module inputs.
input_pipe_A[0] <= A_in;
input_pipe_B[0] <= B_in;
// Remaining stages shift from their predecessor. The loop starts at 1
// so `i-1` is never negative; it does not execute when the depth is 1.
for (i=1; i < INPUT_PIPE_DEPTH; i=i+1) begin: per_input
input_pipe_A [i] <= input_pipe_A [i-1];
input_pipe_B [i] <= input_pipe_B [i-1];
end
end
// Corner case: it isn't possible to put this in the clocked
// always block without registering the output as a consequence.
always @(*) begin
product_out = input_pipe_A [INPUT_PIPE_DEPTH-1] * input_pipe_B [INPUT_PIPE_DEPTH-1];
end
end
// Output pipeline only
else if ((INPUT_PIPE_DEPTH == 0) && (OUTPUT_PIPE_DEPTH > 0)) begin: out_pipe
// OUTPUT_PIPE_DEPTH stages of product; index 0 captures the multiply,
// index OUTPUT_PIPE_DEPTH-1 drives `product_out`.
reg signed [PRODUCT_WIDTH-1:0] output_pipe [OUTPUT_PIPE_DEPTH-1:0];
// Loop index, used by both the `initial` and the clocked block.
integer i;
// Zero every pipeline stage initially.
initial begin
for (i=0; i < OUTPUT_PIPE_DEPTH; i=i+1) begin
output_pipe [i] = PRODUCT_ZERO;
end
end
always @(posedge clock) begin
// Peeled first iteration: stage 0 registers the product of the
// (unregistered) module inputs.
output_pipe [0] <= A_in * B_in;
// Remaining stages shift from their predecessor; loop starts at 1 so
// `i-1` is never negative.
for (i=1; i < OUTPUT_PIPE_DEPTH; i=i+1) begin: per_output
output_pipe [i] <= output_pipe [i-1];
end
end
// Expose the last output stage on the port.
always @(*) begin
product_out = output_pipe [OUTPUT_PIPE_DEPTH-1];
end
end
// Both input and output pipelines
else if ((INPUT_PIPE_DEPTH > 0) && (OUTPUT_PIPE_DEPTH > 0)) begin: in_out_pipe
// Operand pipelines (index 0 fed by the module inputs) and product
// pipeline (index 0 fed by the multiply).
reg signed [WORD_WIDTH_A-1:0] input_pipe_A [INPUT_PIPE_DEPTH-1:0];
reg signed [WORD_WIDTH_B-1:0] input_pipe_B [INPUT_PIPE_DEPTH-1:0];
reg signed [PRODUCT_WIDTH-1:0] output_pipe [OUTPUT_PIPE_DEPTH-1:0];
// Loop index, used by both the `initial` and the clocked block.
integer i;
// Zero every stage of all three pipelines initially.
initial begin
for (i=0; i < INPUT_PIPE_DEPTH; i=i+1) begin
input_pipe_A [i] = WORD_ZERO_A;
input_pipe_B [i] = WORD_ZERO_B;
end
for (i=0; i < OUTPUT_PIPE_DEPTH; i=i+1) begin
output_pipe [i] = PRODUCT_ZERO;
end
end
// Input pipeline, multiplier, and output pipeline all in one clocked
// block.
always @(posedge clock) begin
// Peeled first iteration of the input pipeline.
input_pipe_A[0] <= A_in;
input_pipe_B[0] <= B_in;
for (i=1; i < INPUT_PIPE_DEPTH; i=i+1) begin: per_input
input_pipe_A [i] <= input_pipe_A [i-1];
input_pipe_B [i] <= input_pipe_B [i-1];
end
// Non-blocking assignments: the multiply reads the last input stage as
// it was before this clock edge, so the product is one stage behind it.
output_pipe [0] <= input_pipe_A [INPUT_PIPE_DEPTH-1] * input_pipe_B [INPUT_PIPE_DEPTH-1];
for (i=1; i < OUTPUT_PIPE_DEPTH; i=i+1) begin: per_output
output_pipe [i] <= output_pipe [i-1];
end
end
// Expose the last output stage on the port.
always @(*) begin
product_out = output_pipe [OUTPUT_PIPE_DEPTH-1];
end
end
endgenerate
endmodule