3 releases (breaking)

0.2.0	Dec 22, 2021
0.1.0	Nov 4, 2021
0.0.0	Sep 19, 2021

#691 in Concurrency

EUPL-1.2

500KB
1K SLoC

gpgpu

An experimental async GPU compute library based on wgpu. It is meant to be used alongside wgpu if desired.

To start using gpgpu, just create a Framework instance and follow the examples in the main repository.

Example

Small program that multiplies 2 vectors A and B; and stores the result in another vector C.

Rust program

    use gpgpu::*;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Framework initialization
    let fw = Framework::default();

    // Original CPU data
    let cpu_data = (0..10000).into_iter().collect::<Vec<u32>>();

    // GPU buffer creation
    let buf_a = GpuBuffer::from_slice(&fw, &cpu_data);       // Input
    let buf_b = GpuBuffer::from_slice(&fw, &cpu_data);       // Input
    let buf_c = GpuBuffer::<u32>::with_capacity(&fw, cpu_data.len() as u64);  // Output

    // Shader load from SPIR-V binary file
    let shader = Shader::from_spirv_file(&fw, "<SPIR-V shader path>")?;
    //  or from a WGSL source file
    let shader = Shader::from_wgsl_file(&fw, "<WGSL shader path>")?;    

    // Descriptor set and program creation
    let desc = DescriptorSet::default()
        .bind_buffer(&buf_a, GpuBufferUsage::ReadOnly)
        .bind_buffer(&buf_b, GpuBufferUsage::ReadOnly)
        .bind_buffer(&buf_c, GpuBufferUsage::ReadWrite);
    let program = Program::new(&shader, "main").add_descriptor_set(desc); // Entry point

    // Kernel creation and enqueuing
    Kernel::new(&fw, program).enqueue(cpu_data.len() as u32, 1, 1); // Enqueuing, not very optimus 😅

    let output = buf_c.read_vec_blocking()?;                        // Read back C from GPU
    for (a, b) in cpu_data.into_iter().zip(output) {
        assert_eq!(a.pow(2), b);
    }

    Ok(())
}

Shader program

The shader is written in WGSL

// Vector type definition. Used for both input and output
struct Vector {
    data: [[stride(4)]] array<u32>;
};

// A, B and C vectors
[[group(0), binding(0)]] var<storage, read>  a: Vector;
[[group(0), binding(1)]] var<storage, read>  b: Vector;
[[group(0), binding(2)]] var<storage, read_write> c: Vector;

[[stage(compute), workgroup_size(1)]]
fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
    c.data[global_id.x] = a.data[global_id.x] * b.data[global_id.x];
}

Dependencies

~10–22MB
~310K SLoC