// DriverTrac/venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.c
#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#include <stdlib.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
typedef struct {
PyObject_HEAD;
_Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;
// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
static bool gpuAssert(CUresult code, const char *file, int line) {
  if (code == CUDA_SUCCESS)
    return true;
  const char *prefix = "Triton Error [CUDA]: ";
  const char *str;
  cuGetErrorString(code, &str);
  char err[1024] = {0};
  strcat(err, prefix);
  strcat(err, str);
  PyGILState_STATE gil_state;
  gil_state = PyGILState_Ensure();
  PyErr_SetString(PyExc_RuntimeError, err);
  PyGILState_Release(gil_state);
  return false;
}
// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
  do {                                                                         \
    if (!gpuAssert((ans), __FILE__, __LINE__))                                 \
      goto cleanup;                                                            \
  } while (0)

// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans)                          \
  do {                                                                         \
    if (!gpuAssert((ans), __FILE__, __LINE__)) {                               \
      PyEval_RestoreThread(_save);                                             \
      return NULL;                                                             \
    }                                                                          \
  } while (0)

// Used to check if functions exist in old CUDA driver versions.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
  do {                                                                         \
    if ((funcPointer) == NULL) {                                               \
      (funcPointer) = (initializerFunction)();                                 \
      if ((funcPointer) == NULL) {                                             \
        goto cleanup;                                                          \
      }                                                                        \
    }                                                                          \
  } while (0)
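
// Illustrative sketch (not compiled into the module): the calling pattern the
// three macros above assume. CUDA_CHECK_AND_RETURN_NULL needs a local
// `cleanup:` label in the caller; the _ALLOW_THREADS variant relies on the
// `_save` thread state declared by Py_BEGIN_ALLOW_THREADS, so it may only
// appear between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS. The function
// name and body here are hypothetical.
#if 0
static PyObject *exampleBinding(PyObject *self, PyObject *args) {
  CUdevice device;
  int device_id = 0;
  // Outside the allow-threads region: the error path jumps to `cleanup`.
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGet(&device, device_id));
  Py_BEGIN_ALLOW_THREADS; // declares `_save` and releases the GIL
  // Inside the region: the error path restores the thread state, returns NULL.
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(NULL));
  Py_END_ALLOW_THREADS;
  Py_RETURN_NONE;
cleanup:
  return NULL;
}
#endif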
static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
  int device_id;
  if (!PyArg_ParseTuple(args, "i", &device_id))
    return NULL;
  // Get device handle
  CUdevice device;
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGet(&device, device_id));
  // Query the attributes we care about. The driver reports clock rates in kHz,
  // the memory bus width in bits, and the shared-memory limit in bytes.
  int max_shared_mem;
  int max_num_regs;
  int multiprocessor_count;
  int warp_size;
  int sm_clock_rate;
  int mem_clock_rate;
  int mem_bus_width;
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
  CUDA_CHECK_AND_RETURN_NULL(
      cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
      &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));
  return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
                       max_shared_mem, "max_num_regs", max_num_regs,
                       "multiprocessor_count", multiprocessor_count, "warpSize",
                       warp_size, "sm_clock_rate", sm_clock_rate,
                       "mem_clock_rate", mem_clock_rate, "mem_bus_width",
                       mem_bus_width);
cleanup:
  return NULL;
}
static PyObject *loadBinary(PyObject *self, PyObject *args) {
  const char *name;
  const char *data;
  Py_ssize_t data_size;
  int shared;
  int device;
  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
                        &device)) {
    return NULL;
  }
  CUfunction fun;
  CUmodule mod;
  int32_t n_regs = 0;
  int32_t n_spills = 0;
  int32_t n_max_threads = 0;
  // create driver handles
  CUcontext pctx = 0;
  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
  }
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuModuleGetFunction(&fun, mod, name));
  // get allocated registers and spilled registers from the function
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
  // local memory is reported in bytes; express spills as 32-bit words
  n_spills /= 4;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
  // set dynamic shared memory if necessary
  int shared_optin;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
      device));
  // 49152 bytes (48 KiB) is the default per-block shared-memory limit; larger
  // requests require opting in to the maximum dynamic shared-memory size.
  if (shared > 49152 && shared_optin > 49152) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
    int shared_total, shared_static;
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        device));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                           shared_optin - shared_static));
  }
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }
  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
                       n_spills, n_max_threads);
}
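
// Illustrative sketch (not compiled into the module): how the (module, function)
// handles returned by load_binary are typically consumed through the driver API.
// Triton's actual launcher is generated elsewhere; the grid/block shape, stream
// and parameter buffer below are placeholders.
#if 0
static CUresult launchLoadedKernel(CUfunction fun, CUstream stream,
                                   void **params, int shared_bytes) {
  // One block of 128 threads on a 1x1x1 grid, purely for illustration.
  return cuLaunchKernel(fun, /*gridDimX=*/1, 1, 1, /*blockDimX=*/128, 1, 1,
                        shared_bytes, stream, params, /*extra=*/NULL);
}
#endif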
typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
    int *numClusters, CUfunction func, const CUlaunchConfig *config);

typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

#define defineGetFunctionHandle(name, symbolName)                              \
  static symbolName##_t name() {                                               \
    /* Open the shared library */                                              \
    void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY);                       \
    if (!libHandle) {                                                          \
      PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");      \
      return NULL;                                                             \
    }                                                                          \
    /* Clear any existing error */                                             \
    dlerror();                                                                 \
    symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
    /* Check for errors */                                                     \
    const char *err = dlerror();                                               \
    if (err) {                                                                 \
      PyErr_SetString(PyExc_RuntimeError,                                      \
                      "Failed to retrieve " #symbolName " from libcuda.so.1"); \
      dlclose(libHandle);                                                      \
      return NULL;                                                             \
    }                                                                          \
    return funcHandle;                                                         \
  }

defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                        cuOccupancyMaxActiveClusters);
defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
                        cuTensorMapEncodeTiled);
static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
  int clusterDimX = -1, clusterDimY = -1, clusterDimZ = -1,
      maxActiveClusters = -1;
  int shared = 0;
  CUfunction func;
  if (!PyArg_ParseTuple(args, "Kiiii", &func, &shared, &clusterDimX,
                        &clusterDimY, &clusterDimZ)) {
    return NULL;
  }
  // Let each SM have one block
  int maxActiveBlocks = 1;
  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
  Py_END_ALLOW_THREADS;
  CUlaunchAttribute launchAttr[1];
  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  launchAttr[0].value.clusterDim.x = clusterDimX;
  launchAttr[0].value.clusterDim.y = clusterDimY;
  launchAttr[0].value.clusterDim.z = clusterDimZ;
  CUlaunchConfig config;
  config.gridDimX = clusterDimX;
  config.gridDimY = maxActiveBlocks * clusterDimY;
  config.gridDimZ = clusterDimZ;
  config.blockDimX = 128;
  config.blockDimY = 1;
  config.blockDimZ = 1;
  config.sharedMemBytes = shared;
  config.hStream = 0;
  config.numAttrs = 1;
  config.attrs = launchAttr;
  static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
                                      getCuOccupancyMaxActiveClustersHandle);
  Py_BEGIN_ALLOW_THREADS;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
  Py_END_ALLOW_THREADS;
  return PyLong_FromLong(maxActiveClusters);

cleanup:
  return NULL;
}
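
// Illustrative sketch (not compiled into the module): the CUlaunchConfig built
// above mirrors how a clustered launch would be issued with cuLaunchKernelEx
// (available in recent drivers). The values below are placeholders, not what
// Triton actually launches.
#if 0
static CUresult launchWithClusters(CUfunction fun, void **params) {
  CUlaunchAttribute attr;
  attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
  attr.value.clusterDim.x = 2;
  attr.value.clusterDim.y = 1;
  attr.value.clusterDim.z = 1;
  CUlaunchConfig config = {0};
  config.gridDimX = 2; // must be a multiple of the cluster dimension
  config.gridDimY = 1;
  config.gridDimZ = 1;
  config.blockDimX = 128;
  config.blockDimY = 1;
  config.blockDimZ = 1;
  config.attrs = &attr;
  config.numAttrs = 1;
  return cuLaunchKernelEx(&config, fun, params, /*extra=*/NULL);
}
#endif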
static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
  long size;
  if (!PyArg_ParseTuple(args, "l", &size)) {
    return NULL;
  }
  if (size < 0) {
    PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
    return NULL;
  }

  Py_BEGIN_ALLOW_THREADS;

  // Ensure we have an active context.
  CUcontext ctx = NULL;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
  if (!ctx) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
  }

  // We can't set the fifo size after running a kernel that calls printf. This
  // is true even if the set() call is a nop and the new size is the same as
  // the old size.
  //
  // This is unfriendly, so check if the old size matches the new size, and
  // skip the set() call if so.
  size_t oldSize = 0;
  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
      cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
  if (oldSize != size) {
    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
  }

  Py_END_ALLOW_THREADS;
  Py_RETURN_NONE;
}
// Custom allocator: the default Python object allocator does not guarantee the
// 128-byte alignment requested for the embedded CUtensorMap, so instances are
// backed by posix_memalign and released with the matching free() below.
static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
  PyCUtensorMapObject *self = NULL;
  void *mem = NULL;
  size_t size = type->tp_basicsize;
  if (posix_memalign(&mem, 128, size) != 0) {
    PyErr_NoMemory();
    return NULL;
  }
  self = (PyCUtensorMapObject *)mem;
  PyObject_INIT(self, type);
  return (PyObject *)self;
}

static void PyCUtensorMap_dealloc(PyObject *self) {
  Py_TYPE(self)->tp_free(self);
}

// Must pair with the posix_memalign allocation in PyCUtensorMap_alloc.
static void PyCUtensorMap_free(void *ptr) { free(ptr); }

// clang-format off
static PyTypeObject PyCUtensorMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "triton.backends.nvidia.PyCUtensorMap",
    .tp_basicsize = sizeof(PyCUtensorMapObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_doc = "<PyCUtensorMap object>",
    .tp_new = PyType_GenericNew,
    .tp_alloc = PyCUtensorMap_alloc,
    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
    .tp_free = PyCUtensorMap_free,
};
// clang-format on
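
// Illustrative sketch (not compiled into the module): how C-side launcher code
// might pull the hardware descriptor back out of a PyCUtensorMap instance
// produced by fill_tma_descriptor below. The helper name is hypothetical.
#if 0
static CUtensorMap *unwrapTensorMap(PyObject *obj) {
  if (!PyObject_TypeCheck(obj, &PyCUtensorMapType)) {
    PyErr_SetString(PyExc_TypeError, "expected a PyCUtensorMap");
    return NULL;
  }
  return &((PyCUtensorMapObject *)obj)->tensorMap;
}
#endif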
static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
  unsigned long long global_address;
  int swizzle;
  int elemSize;
  int elemType;
  PyObject *blockSize;
  PyObject *shape;
  PyObject *strides;
  int padding;
  if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
                        &elemType, &blockSize, &shape, &strides, &padding)) {
    return NULL;
  }
  PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
      (PyObject *)&PyCUtensorMapType, NULL);
  if (!desc) {
    return NULL;
  }
  PyObject *blockSizeFast = NULL;
  PyObject *shapeFast = NULL;
  PyObject *stridesFast = NULL;
  uint32_t blockSizeInt[5];
  uint64_t shapeInt[5];
  uint64_t stridesLL[5];
  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
  if (!blockSizeFast)
    goto cleanup;
  int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
  // The staging arrays above hold at most 5 dimensions.
  if (rank < 1 || rank > 5) {
    PyErr_SetString(PyExc_ValueError,
                    "TMA descriptors support between 1 and 5 dimensions");
    goto cleanup;
  }
  // Copy in reverse order: index 0 of the encoded arrays describes the
  // innermost (fastest-varying) dimension.
  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "block size must be an int");
      goto cleanup;
    }
    blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
  }
  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
  if (!shapeFast)
    goto cleanup;
  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  for (int i = 0; i < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "shape must be an int");
      goto cleanup;
    }
    shapeInt[rank - i - 1] = PyLong_AsLong(item);
  }
  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
  if (!stridesFast)
    goto cleanup;
  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
    goto cleanup;
  }
  // Strides are given in elements; convert to bytes while reversing.
  for (int i = 0; i + 1 < rank; ++i) {
    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
    if (!PyLong_Check(item)) {
      PyErr_SetString(PyExc_TypeError, "strides must be an int");
      goto cleanup;
    }
    stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
  }
  stridesLL[rank - 1] =
      shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);
  Py_DECREF(blockSizeFast);
  blockSizeFast = NULL;
  Py_DECREF(shapeFast);
  shapeFast = NULL;
  Py_DECREF(stridesFast);
  stridesFast = NULL;
  CUtensorMapFloatOOBfill fill =
      (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
                     : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
  uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
                                      getCuTensorMapEncodeTiledHandle);
  CUDA_CHECK_AND_RETURN_NULL(cuTensorMapEncodeTiled(
      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));
  return (PyObject *)desc;

cleanup:
  Py_XDECREF(blockSizeFast);
  Py_XDECREF(shapeFast);
  Py_XDECREF(stridesFast);
  Py_XDECREF(desc);
  return NULL;
}
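
// Illustrative sketch (not compiled into the module): a CUtensorMap built by
// fill_tma_descriptor is passed to a kernel by value, so a driver-API launcher
// points the corresponding kernelParams slot at the aligned tensorMap member.
// The extra output parameter and the launch shape here are placeholders.
#if 0
static CUresult launchWithTMA(CUfunction fun, PyCUtensorMapObject *desc,
                              CUdeviceptr out) {
  void *params[] = {&desc->tensorMap, &out};
  return cuLaunchKernel(fun, 1, 1, 1, 128, 1, 1, /*sharedMemBytes=*/0,
                        /*hStream=*/0, params, /*extra=*/NULL);
}
#endif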
static PyMethodDef ModuleMethods[] = {
    {"load_binary", loadBinary, METH_VARARGS,
     "Load provided cubin into CUDA driver"},
    {"get_device_properties", getDeviceProperties, METH_VARARGS,
     "Get the properties for a given device"},
    {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
     "Python interface for cuOccupancyMaxActiveClusters function"},
    {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
     "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
     "controls how many bytes can be streamed from kernels before data starts "
     "being dropped. This inherits all the limitations of this call; in "
     "particular it's an error to change this value after launching any kernel "
     "that calls printf()."},
    {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS,
     "Build a PyCUtensorMap (TMA descriptor) for a tiled global-memory tensor "
     "via cuTensorMapEncodeTiled"},
    {NULL, NULL, 0, NULL} // sentinel
};
static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
                                       NULL, // documentation
                                       -1,   // size
                                       ModuleMethods};

PyMODINIT_FUNC PyInit_cuda_utils(void) {
  if (PyType_Ready(&PyCUtensorMapType) < 0) {
    return NULL;
  }
  PyObject *m = PyModule_Create(&ModuleDef);
  if (m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  Py_INCREF(&PyCUtensorMapType);
  PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);
  return m;
}