* tests: Fix memory bandwidth calculation for perf tests Add a flops calculation for flash attention. Add one GGML_OP_CPY perf test. * vulkan: Optimize contiguous copies Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead. Apply similar changes to the scale shader, since scale is always contiguous. Add a "progress bar" for shader compiles.
43 lines · 1.1 KiB · GLSL
#version 450
|
|
|
|
#include "types.comp"
|
|
#include "generic_unary_head.comp"
|
|
|
|
#extension GL_EXT_control_flow_attributes : require
|
|
|
|
const uint num_threads = 128;
|
|
|
|
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
void main() {
|
|
uint idx = get_idx();
|
|
|
|
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
|
const uint num_iter = 4;
|
|
|
|
// fast path for when all four iterations are in-bounds
|
|
if (idx + (num_iter-1)*num_threads < p.ne) {
|
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
|
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
|
#else
|
|
data_d[p.d_offset + idx] = data_a[idx];
|
|
#endif
|
|
idx += num_threads;
|
|
}
|
|
} else {
|
|
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
|
if (idx >= p.ne) {
|
|
continue;
|
|
}
|
|
|
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
|
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
|
|
#else
|
|
data_d[p.d_offset + idx] = data_a[idx];
|
|
#endif
|
|
idx += num_threads;
|
|
}
|
|
}
|
|
}
|