70 lines
2.1 KiB
C
70 lines
2.1 KiB
C
/* clang-format off */
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
#include <inttypes.h>
|
|
#include <string.h>
|
|
#include <cuda.h>
|
|
|
|
|
|
// helpers to check for cuda errors
|
|
#define CUDA_CHECK(ans) {{\
|
|
gpuAssert((ans), __FILE__, __LINE__);\
|
|
}}\
|
|
|
|
static inline void gpuAssert(CUresult code, const char *file, int line) {{
|
|
if (code != CUDA_SUCCESS) {{
|
|
const char *prefix = "Triton Error [CUDA]: ";
|
|
const char *str;
|
|
cuGetErrorString(code, &str);
|
|
char err[1024] = {{0}};
|
|
strcat(err, prefix);
|
|
strcat(err, str);
|
|
printf("%s\\n", err);
|
|
exit(code);
|
|
}}
|
|
}}
|
|
|
|
// globals
|
|
#define CUBIN_NAME {kernel_name}_cubin
|
|
CUmodule {kernel_name}_mod = NULL;
|
|
CUfunction {kernel_name}_func = NULL;
|
|
unsigned char CUBIN_NAME[{bin_size}] = {{ {bin_data} }};
|
|
|
|
|
|
void unload_{kernel_name}(void) {{
|
|
CUDA_CHECK(cuModuleUnload({kernel_name}_mod));
|
|
}}
|
|
|
|
// TODO: some code duplication with `runtime/backend/cuda.c`
|
|
void load_{kernel_name}() {{
|
|
int dev = 0;
|
|
void *bin = (void *)&CUBIN_NAME;
|
|
int shared = {shared};
|
|
CUDA_CHECK(cuModuleLoadData(&{kernel_name}_mod, bin));
|
|
CUDA_CHECK(cuModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
|
|
// set dynamic shared memory if necessary
|
|
int shared_optin;
|
|
CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev));
|
|
if (shared > 49152 && shared_optin > 49152) {{
|
|
CUDA_CHECK(cuFuncSetCacheConfig({kernel_name}_func, CU_FUNC_CACHE_PREFER_SHARED));
|
|
CUDA_CHECK(cuFuncSetAttribute({kernel_name}_func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin))
|
|
}}
|
|
}}
|
|
|
|
/*
|
|
{kernel_docstring}
|
|
*/
|
|
CUresult {kernel_name}(CUstream stream, {signature}) {{
|
|
if ({kernel_name}_func == NULL)
|
|
load_{kernel_name}();
|
|
unsigned int gX = {gridX};
|
|
unsigned int gY = {gridY};
|
|
unsigned int gZ = {gridZ};
|
|
CUdeviceptr global_scratch = 0;
|
|
CUdeviceptr profile_scratch = 0;
|
|
void *args[{num_args}] = {{ {arg_pointers} }};
|
|
// TODO: shared memory
|
|
if(gX * gY * gZ > 0)
|
|
return cuLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * 32, 1, 1, {shared}, stream, args, NULL);
|
|
}}
|