#include "cuda.h"
|
|
#include <dlfcn.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#define PY_SSIZE_T_CLEAN
|
|
#include <Python.h>
|
|
|
|
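
// Host-side wrapper around a CUtensorMap descriptor. The descriptor is kept
// 128-byte aligned; the custom tp_alloc below (posix_memalign) provides the
// over-alignment that the default object allocator does not guarantee.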
typedef struct {
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;

// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
static bool gpuAssert(CUresult code, const char *file, int line) {
  if (code == CUDA_SUCCESS)
    return true;

  const char *prefix = "Triton Error [CUDA]: ";
  // cuGetErrorString sets *str to NULL for unrecognized error codes, so guard
  // before concatenating.
  const char *str = NULL;
  cuGetErrorString(code, &str);
  char err[1024] = {0};
  strcat(err, prefix);
  if (str)
    strcat(err, str);
  PyGILState_STATE gil_state;
  gil_state = PyGILState_Ensure();
  PyErr_SetString(PyExc_RuntimeError, err);
  PyGILState_Release(gil_state);
  return false;
}

// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
  do {                                                                         \
    if (!gpuAssert((ans), __FILE__, __LINE__))                                 \
      goto cleanup;                                                            \
  } while (0)
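
// Note that, despite its name, CUDA_CHECK_AND_RETURN_NULL jumps to a
// `cleanup:` label rather than returning directly, so callers can release
// partially-built state before returning NULL; every function below that uses
// it defines such a label.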

// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans)                          \
  do {                                                                         \
    if (!gpuAssert((ans), __FILE__, __LINE__)) {                               \
      PyEval_RestoreThread(_save);                                             \
      return NULL;                                                             \
    }                                                                          \
  } while (0)
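
// `_save` is the hidden PyThreadState * local declared by
// Py_BEGIN_ALLOW_THREADS, so PyEval_RestoreThread(_save) re-acquires the GIL
// before the early return.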

// Used to check if functions exist in old CUDA driver versions.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
  do {                                                                         \
    if ((funcPointer) == NULL) {                                               \
      (funcPointer) = (initializerFunction)();                                 \
      if ((funcPointer) == NULL) {                                             \
        goto cleanup;                                                          \
      }                                                                        \
    }                                                                          \
  } while (0)
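
// get_device_properties(device_id) -> dict with keys "max_shared_mem",
// "max_num_regs", "multiprocessor_count", "warpSize", "sm_clock_rate",
// "mem_clock_rate", and "mem_bus_width" (clock rates in kHz, bus width in
// bits, as reported by cuDeviceGetAttribute).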
static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
  int device_id;
  if (!PyArg_ParseTuple(args, "i", &device_id))
    return NULL;
  // Get device handle
  CUdevice device;
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGet(&device, device_id));

  // Query the attributes we expose to Python.
  int max_shared_mem;
  int max_num_regs;
  int multiprocessor_count;
  int warp_size;
  int sm_clock_rate;
  int mem_clock_rate;
  int mem_bus_width;
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
  CUDA_CHECK_AND_RETURN_NULL(
      cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));

  return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
                       max_shared_mem, "max_num_regs", max_num_regs,
                       "multiprocessor_count", multiprocessor_count, "warpSize",
                       warp_size, "sm_clock_rate", sm_clock_rate,
                       "mem_clock_rate", mem_clock_rate, "mem_bus_width",
                       mem_bus_width);

cleanup:
  return NULL;
}
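
// load_binary(name, data, shared, device): loads a cubin image into the
// current context (creating/retaining the primary context if needed) and
// returns (module_handle, function_handle, n_regs, n_spills, n_max_threads),
// with the first two handles passed back to Python as plain integers.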
static PyObject *loadBinary(PyObject *self, PyObject *args) {
  const char *name;
  const char *data;
  Py_ssize_t data_size;
  int shared;
  int device;
  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
                        &device)) {
    return NULL;
  }
  CUfunction fun;
  CUmodule mod;
  int32_t n_regs = 0;
  int32_t n_spills = 0;
  int32_t n_max_threads = 0;
  // create driver handles
  CUcontext pctx = 0;

  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
  }

  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuModuleGetFunction(&fun, mod, name));
  // get allocated registers and spilled registers from the function
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  // CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES is reported in bytes; convert to
  // 4-byte (register-sized) units.
  n_spills /= 4;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
  // set dynamic shared memory if necessary
  // (49152 = 48 KiB, the default per-block shared-memory limit; opt in to more
  // when both the kernel request and the device allow it)
  int shared_optin;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  if (shared > 49152 && shared_optin > 49152) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
    int shared_total, shared_static;
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        device));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                           shared_optin - shared_static));
  }
  Py_END_ALLOW_THREADS;

  if (PyErr_Occurred()) {
    return NULL;
  }
  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
                       n_spills, n_max_threads);
}
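
// cuOccupancyMaxActiveClusters and cuTensorMapEncodeTiled may be absent from
// older CUDA drivers, so instead of linking against them directly we declare
// matching function-pointer types and resolve the symbols lazily from
// libcuda.so.1 via dlopen/dlsym (see defineGetFunctionHandle below).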
typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
    int *numClusters, CUfunction func, const CUlaunchConfig *config);

typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

#define defineGetFunctionHandle(name, symbolName)                              \
  static symbolName##_t name() {                                               \
    /* Open the shared library */                                              \
    void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY);                       \
    if (!libHandle) {                                                          \
      PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");      \
      return NULL;                                                             \
    }                                                                          \
    /* Clear any existing error */                                             \
    dlerror();                                                                 \
    symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
    /* Check for errors */                                                     \
    const char *err = dlerror();                                               \
    if (err) {                                                                 \
      PyErr_SetString(PyExc_RuntimeError,                                      \
                      "Failed to retrieve " #symbolName " from libcuda.so.1"); \
      dlclose(libHandle);                                                      \
      return NULL;                                                             \
    }                                                                          \
    return funcHandle;                                                         \
  }

defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                        cuOccupancyMaxActiveClusters);

defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
                        cuTensorMapEncodeTiled);
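
// cuOccupancyMaxActiveClusters(function_handle, shared_mem_bytes, clusterDimX,
// clusterDimY, clusterDimZ) -> int. Builds a representative launch
// configuration (128-thread blocks, grid shaped by the requested cluster
// dimensions) and asks the driver how many such clusters can be resident at
// once.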
static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
  int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
      maxActiveClusters = -1;
  int shared = 0;
  CUfunction func;

  if (!PyArg_ParseTuple(args, "Kiiii", &func, &shared, &clusterDimX,
                        &clusterDimY, &clusterDimZ)) {
    return NULL;
  }

  // Let each SM have one block
  int maxActiveBlocks = 1;
  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
  Py_END_ALLOW_THREADS;

  CUlaunchAttribute launchAttr[1];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterDimX;
  launchAttr[0].value.clusterDim.y = clusterDimY;
  launchAttr[0].value.clusterDim.z = clusterDimZ;
  CUlaunchConfig config;
  config.gridDimX = clusterDimX;
  config.gridDimY = maxActiveBlocks * clusterDimY;
  config.gridDimZ = clusterDimZ;
  config.blockDimX = 128;
  config.blockDimY = 1;
  config.blockDimZ = 1;
  config.sharedMemBytes = shared;
  config.hStream = 0;
  config.numAttrs = 1;
  config.attrs = launchAttr;

  static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
                                      getCuOccupancyMaxActiveClustersHandle);

  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
  Py_END_ALLOW_THREADS;
  return PyLong_FromLong(maxActiveClusters);

cleanup:
  return NULL;
}

static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
  long size;
  if (!PyArg_ParseTuple(args, "l", &size)) {
    return NULL;
  }
  if (size < 0) {
    PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
    return NULL;
  }

  Py_BEGIN_ALLOW_THREADS;

  // Ensure we have an active context.
  CUcontext ctx = NULL;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
  if (!ctx) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
  }

  // We can't set the fifo size after running a kernel that calls printf. This
  // is true even if the set() call is a nop and the new size is the same as
  // the old size.
  //
  // This is unfriendly, so check if the old size matches the new size, and
  // skip the set() call if so.
  size_t oldSize = 0;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
  if (oldSize != (size_t)size) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
  }

  Py_END_ALLOW_THREADS;
  Py_RETURN_NONE;
}
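
// Custom tp_alloc/tp_free pair: CPython's default allocator does not honor the
// 128-byte alignment requested for the embedded CUtensorMap, so allocate with
// posix_memalign and release with plain free().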
static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
  PyCUtensorMapObject *self = NULL;
  void *mem = NULL;
  size_t size = type->tp_basicsize;

  if (posix_memalign(&mem, 128, size) != 0) {
    PyErr_NoMemory();
    return NULL;
  }

  self = (PyCUtensorMapObject *)mem;
  PyObject_INIT(self, type);
  return (PyObject *)self;
}

static void PyCUtensorMap_dealloc(PyObject *self) {
  Py_TYPE(self)->tp_free(self);
}

static void PyCUtensorMap_free(void *ptr) { free(ptr); }

// clang-format off
static PyTypeObject PyCUtensorMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "triton.backends.nvidia.PyCUtensorMap",
    .tp_basicsize = sizeof(PyCUtensorMapObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_doc = "<PyCUtensorMap object>",
    .tp_new = PyType_GenericNew,
    .tp_alloc = PyCUtensorMap_alloc,
    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
    .tp_free = PyCUtensorMap_free,
};
// clang-format on
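
// fill_tma_descriptor(global_address, swizzle, elemSize, elemType, blockSize,
// shape, strides, padding) -> PyCUtensorMap. Strides arrive in elements and
// are scaled to bytes here; shape/block/stride entries are reversed to match
// cuTensorMapEncodeTiled's innermost-dimension-first ordering, while the
// Python side passes them outermost-first.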
static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
  unsigned long long global_address;
  int swizzle;
  int elemSize;
  int elemType;
  PyObject *blockSize;
  PyObject *shape;
  PyObject *strides;
  int padding;

  if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
                        &elemType, &blockSize, &shape, &strides, &padding)) {
    return NULL;
  }

  PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
      (PyObject *)&PyCUtensorMapType, NULL);
  if (!desc) {
    return NULL;
  }

  PyObject *blockSizeFast = NULL;
  PyObject *shapeFast = NULL;
  PyObject *stridesFast = NULL;

  uint32_t blockSizeInt[5];
  uint64_t shapeInt[5];
  uint64_t stridesLL[5];

  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
  if (!blockSizeFast)
    goto cleanup;
  int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
  // The scratch arrays above hold at most 5 dimensions; reject anything
  // outside that range before writing into them.
  if (rank < 1 || rank > 5) {
    PyErr_SetString(PyExc_ValueError, "rank must be between 1 and 5");
    goto cleanup;
  }

  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "block size must be an int");
      goto cleanup;
    }
    blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
  }

  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
  if (!shapeFast)
    goto cleanup;

  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "shape must be an int");
      goto cleanup;
    }
    shapeInt[rank - i - 1] = PyLong_AsLong(item);
  }

  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
  if (!stridesFast)
    goto cleanup;

  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  // Only the outermost rank - 1 strides are read from Python (the innermost
  // dimension is assumed contiguous); each is converted from elements to
  // bytes.
  for (int i = 0; i + 1 < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "strides must be an int");
      goto cleanup;
    }
    stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
  }
  stridesLL[rank - 1] =
      shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);
  Py_DECREF(blockSizeFast);
  blockSizeFast = NULL;
  Py_DECREF(shapeFast);
  shapeFast = NULL;
  Py_DECREF(stridesFast);
  stridesFast = NULL;

  CUtensorMapFloatOOBfill fill =
      (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
                     : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;

  uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
                                      getCuTensorMapEncodeTiledHandle);
  CUDA_CHECK_AND_RETURN_NULL(cuTensorMapEncodeTiled(
      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));

  return (PyObject *)desc;

cleanup:
  Py_XDECREF(blockSizeFast);
  Py_XDECREF(shapeFast);
  Py_XDECREF(stridesFast);
  Py_XDECREF(desc);
  return NULL;
}

static PyMethodDef ModuleMethods[] = {
    {"load_binary", loadBinary, METH_VARARGS,
     "Load provided cubin into CUDA driver"},
    {"get_device_properties", getDeviceProperties, METH_VARARGS,
     "Get the properties for a given device"},
    {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
     "Python interface for cuOccupancyMaxActiveClusters function"},
    {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
     "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
     "controls how many bytes can be streamed from kernels before data starts "
     "being dropped. This inherits all the limitations of this call; in "
     "particular it's an error to change this value after launching any kernel "
     "that calls printf()."},
    {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS,
     "Build a TMA descriptor (CUtensorMap) for a tiled global-memory tensor"},

    {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
                                       NULL, // documentation
                                       -1,   // size
                                       ModuleMethods};
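
// Rough Python-side usage once the extension is built (a sketch only; the
// module is named "cuda_utils" here, but the import path depends on how the
// backend compiles and loads it, and the argument names below are
// placeholders):
//
//   import cuda_utils
//   props = cuda_utils.get_device_properties(0)
//   mod, fun, n_regs, n_spills, n_max_threads = cuda_utils.load_binary(
//       "kernel_name", cubin_bytes, shared_mem_bytes, device_id)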

PyMODINIT_FUNC PyInit_cuda_utils(void) {
  if (PyType_Ready(&PyCUtensorMapType) < 0) {
    return NULL;
  }

  PyObject *m = PyModule_Create(&ModuleDef);
  if (m == NULL) {
    return NULL;
  }

  PyModule_AddFunctions(m, ModuleMethods);
  Py_INCREF(&PyCUtensorMapType);
  PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);

  return m;
}