/*
 * SPDX-License-Identifier: Apache-2.0
 */

#include "onnx/defs/math/utils.h"

#include <functional>
#include <string>
#include <vector>

namespace ONNX_NAMESPACE {
namespace defs {
namespace math {
namespace utils {

static const char* TopK_ver11_doc = R"DOC(
Retrieve the top-K largest or smallest elements along a specified axis. Given an input tensor of
shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs:

* Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}]
  which contains the values of the top k elements along the specified axis
* Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}]
  which contains the indices of the top k elements (original indices from the input tensor).

* If "largest" is 1 (the default value) then the k largest elements are returned.
* If "sorted" is 1 (the default value) then the resulting k elements will be sorted.
* If "sorted" is 0, the order of the returned 'Values' and 'Indices' is undefined.

Given two equivalent values, this operator uses the indices along the axis as
a tiebreaker. That is, the element with the lower index will appear first.
)DOC";
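// Illustrative example of the semantics above (a sketch, not normative):
// with X = [[0, 2, 4], [5, 3, 1]], axis = 1, K = [2], largest = 1 and
// sorted = 1, the outputs are
//   Values  = [[4, 2], [5, 3]]
//   Indices = [[2, 1], [0, 1]]
// Given equal values, the element with the lower index appears first.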
std::function<void(OpSchema&)> TopKOpGenerator(const std::vector<std::string>& allowed_types) {
  return [=](OpSchema& schema) {
    schema.SetDoc(TopK_ver11_doc)
        .Input(0, "X", "Tensor of shape [a_0, a_1, ..., a_{n-1}]", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
        .Input(
            1,
            "K",
            "A 1-D tensor containing a single positive value corresponding to the number of top elements to retrieve",
            "tensor(int64)",
            OpSchema::Single,
            true,
            1,
            OpSchema::NonDifferentiable)
        .Output(
            0,
            "Values",
            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
            "containing top K values from the input tensor",
            "T",
            OpSchema::Single,
            true,
            1,
            OpSchema::Differentiable)
        .Output(
            1,
            "Indices",
            "Tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] "
            "containing the corresponding input tensor indices for the top K values.",
            "I",
            OpSchema::Single,
            true,
            1,
            OpSchema::NonDifferentiable)
        .TypeConstraint("T", allowed_types, "Constrain input and output types to numeric tensors.")
        .TypeConstraint("I", {"tensor(int64)"}, "Constrain index tensor to int64")
        .Attr(
            "axis",
            "Dimension on which to do the sort. Negative value means counting dimensions "
            "from the back. Accepted range is [-r, r-1] where r = rank(input).",
            AttributeProto::INT,
            static_cast<int64_t>(-1))
        .Attr(
            "largest",
            "Whether to return the top-K largest or smallest elements.",
            AttributeProto::INT,
            static_cast<int64_t>(1))
        .Attr("sorted", "Whether to return the elements in sorted order.", AttributeProto::INT, static_cast<int64_t>(1))
        .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
          // Type inference:
          propagateElemTypeFromInputToOutput(ctx, 0, 0);
          updateOutputElemType(ctx, 1, TensorProto::INT64);

          // Shape inference:
          if (!hasInputShape(ctx, 0))
            return;
          auto& input_shape = getInputShape(ctx, 0);
          int64_t rank = input_shape.dim_size();
          int64_t axis = getAttribute(ctx, "axis", -1);
          if (axis < 0)
            axis += rank;
          if (axis < 0 || axis >= rank) {
            fail_shape_inference("Invalid value for attribute axis");
          }

          const auto& axis_dim = input_shape.dim(static_cast<int>(axis));
          const auto* k = ctx.getInputData(1);

          // Infer output shape if:
          // (1) 'K' is available
          // (2) axis_dim has dim value
          // Otherwise cannot reliably compute output shape as axis dim value is
          // unknown and hence cannot determine if axis dim value >= k (which
          // should be enforced)
          if (nullptr != k && axis_dim.has_dim_value()) {
            int64_t k_value = 0;
            if (k->dims_size() != 1 || k->dims(0) != 1) {
              fail_shape_inference("K input must be a one-dimensional tensor of size 1.");
            }
            if (k->data_type() == TensorProto::INT64) {
              const auto data = ParseData<int64_t>(k);
              k_value = data[0];
            } else {
              fail_shape_inference("K input must be of type int64.");
            }
            if (axis_dim.dim_value() < k_value) {
              fail_shape_inference("Axis has less than the requested k elements.");
            }

            TensorShapeProto result_shape = input_shape;
            result_shape.mutable_dim(static_cast<int>(axis))->set_dim_value(k_value);

            updateOutputShape(ctx, 0, result_shape);
            updateOutputShape(ctx, 1, result_shape);
            return;
          }

          // Infer output shapes' rank in any case
          auto* output_shape_0 = getOutputShape(ctx, 0);
          auto* output_shape_1 = getOutputShape(ctx, 1);
          for (int i = 0; i < input_shape.dim_size(); ++i) {
            output_shape_0->add_dim();
            output_shape_1->add_dim();
          }
          return;
        });
  };
}
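// Shape-inference sketch for the TopK generator above (illustrative): for an
// input of shape [2, 5, 7] with axis = 1 and a constant K = [3], both outputs
// get shape [2, 3, 7]; if the axis dimension were smaller than 3,
// fail_shape_inference would be raised. When K is not a constant or the axis
// dimension is unknown, only the output rank is propagated.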
int MathOpTwoIntegers(const std::string& op_type, int a, int b) {
  if (op_type == "Add") {
    return a + b;
  } else if (op_type == "Sub") {
    return a - b;
  } else if (op_type == "Mul") {
    return a * b;
  }
  fail_shape_inference("Wrong op_type name for running propagation: ", op_type);
}

void MatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int input1Idx, int input2Idx) {
  if (!hasInputShape(ctx, input1Idx) || !hasInputShape(ctx, input2Idx)) {
    return;
  }

  const auto shape0 = ctx.getInputType(input1Idx)->tensor_type().shape();
  const auto shape1 = ctx.getInputType(input2Idx)->tensor_type().shape();

  if (shape0.dim_size() == 0 || shape1.dim_size() == 0) {
    fail_shape_inference("Input tensors of wrong rank (0).");
  }

  ONNX_NAMESPACE::TensorShapeProto shapeL, shapeR;

  // First promote each shape to at least rank-2. This logic is
  // specific to matmul, not generic broadcasting.
  {
    if (shape0.dim_size() == 1) {
      shapeL.add_dim()->set_dim_value(1);
      *shapeL.add_dim() = shape0.dim(0);
    } else {
      *shapeL.mutable_dim() = shape0.dim();
    }
    if (shape1.dim_size() == 1) {
      *shapeR.add_dim() = shape1.dim(0);
      shapeR.add_dim()->set_dim_value(1);
    } else {
      *shapeR.mutable_dim() = shape1.dim();
    }
  }

  // Check for compatible matrix multiply dimensions
  {
    const auto& dimL = shapeL.dim(shapeL.dim_size() - 1);
    const auto& dimR = shapeR.dim(shapeR.dim_size() - 2);
    if (dimL.has_dim_value() && dimR.has_dim_value() && dimL.dim_value() != dimR.dim_value()) {
      fail_shape_inference("Incompatible dimensions for matrix multiplication");
    }
  }

  ONNX_NAMESPACE::TensorShapeProto resultShape;

  // Now call out to generic multidimensional broadcasting for
  // the broadcastable prefixes.
  {
    ONNX_NAMESPACE::TensorShapeProto prefixShapeL, prefixShapeR;
    for (int i = 0; i < shapeL.dim_size() - 2; ++i) {
      *prefixShapeL.add_dim() = shapeL.dim(i);
    }
    for (int i = 0; i < shapeR.dim_size() - 2; ++i) {
      *prefixShapeR.add_dim() = shapeR.dim(i);
    }
    bidirectionalBroadcastShapeInference(prefixShapeL, prefixShapeR, resultShape);
  }

  // Back to matmul-specific. Add the trailing dimensions back in.
  {
    if (shape0.dim_size() != 1) {
      *resultShape.add_dim() = shapeL.dim(shapeL.dim_size() - 2);
    }
    if (shape1.dim_size() != 1) {
      *resultShape.add_dim() = shapeR.dim(shapeR.dim_size() - 1);
    }
  }

  *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape() = resultShape;
}
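// Worked example of the inference above (illustrative): A with shape
// [2, 3, 4, 5] and B with shape [5, 6] are promoted to ranks 4 and 2, the
// broadcastable prefixes [2, 3] and [] broadcast to [2, 3], and the result
// shape is [2, 3, 4, 6]. A 1-D input contributes no trailing dimension of
// its own, so [5] x [5, 6] yields shape [6].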
void QLinearMatMulShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) {
  auto a_type = ctx.getInputType(0);
  auto b_type = ctx.getInputType(3);
  if (nullptr == a_type || nullptr == b_type || a_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType ||
      b_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
    fail_type_inference("inputs are expected to have tensor type.");
  }

  auto a_zero_point_type = ctx.getInputType(2);
  if (nullptr == a_zero_point_type ||
      a_zero_point_type->tensor_type().elem_type() != a_type->tensor_type().elem_type()) {
    fail_type_inference("input and zero_point pair are expected to have the same type.");
  }

  auto b_zero_point_type = ctx.getInputType(5);
  if (nullptr == b_zero_point_type ||
      b_zero_point_type->tensor_type().elem_type() != b_type->tensor_type().elem_type()) {
    fail_type_inference("input and zero_point pair are expected to have the same type.");
  }

  propagateElemTypeFromInputToOutput(ctx, 7, 0);

  MatMulShapeInference(ctx, 0, 3);
}

const char* QLinearMatMulDoc() {
  static const char* QLinearMatMul_doc = R"DOC(
Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html).
It consumes two quantized input tensors, their scales and zero points, the scale and zero point of the output,
and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point).
For (x / y_scale), the result is rounded to the nearest value, with ties rounding to even. Refer to
https://en.wikipedia.org/wiki/Rounding for details.
Scale and zero point must have the same shape. They must be either scalars (per tensor) or N-D tensors
(per row for 'a' and per column for 'b'). Scalar refers to per-tensor quantization whereas N-D refers to
per-row or per-column quantization. If the input is 2-D with shape [M, K], then the zero point and scale
tensors may be an M-element vector [v_1, v_2, ..., v_M] for per-row quantization or a K-element vector
[v_1, v_2, ..., v_K] for per-column quantization. If the input is an N-D tensor with shape [D1, D2, M, K],
then the zero point and scale tensors may have shape [D1, D2, M, 1] for per-row quantization or shape
[D1, D2, 1, K] for per-column quantization.
Production must never overflow, and accumulation may overflow only if it is performed in 32 bits.
)DOC";
  return QLinearMatMul_doc;
}

} // namespace utils
} // namespace math
} // namespace defs
} // namespace ONNX_NAMESPACE
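// Worked example of the quantization formula in QLinearMatMulDoc (a sketch,
// assuming a uint8 output with y_scale = 0.5 and y_zero_point = 128):
//   x = 25.3  -> x / y_scale = 50.6 -> rounds to 51 -> 51 + 128 = 179
//   x = 25.25 -> x / y_scale = 50.5 -> ties to even: 50 -> 50 + 128 = 178
// Results outside [0, 255] would be saturated to that range.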