This is the Rust version of the ‘Hello Sort’ example application.

1. Build Your Compute Kernel

Prepare the source code to offload.
The mu_sort.rs file is implemented using the sort_unstable method. You can customize the algorithm in this file as needed.

#![no_main]
extern crate mu_macro;
use mu_macro::mu_kernel;

extern crate mu_lib;
use mu_lib::task::get_task_idx;
use mu_lib::uart_println;

/// Sorts the `size` `i32` values starting at `arr` in place, in ascending
/// order, and then prints the sorted slice with `uart_println!`.
///
/// The caller must pass a pointer that is valid for reads and writes of
/// `size` consecutive `i32` values; the body builds a mutable slice from it.
#[mu_kernel]
pub fn sort_with_tensor(arr: *mut i32, size: usize) {
    // SAFETY: relies on the caller's guarantee that `arr` addresses `size`
    // initialized `i32` values that nothing else accesses during this call.
    let values: &mut [i32] = unsafe { std::slice::from_raw_parts_mut(arr, size) };
    values.sort_unstable();
    uart_println!("Sorted array: {:?}", values);
}

/// Sorts one `size`-element chunk of the buffer at `arr` in place, in
/// ascending order. The chunk is selected by the current task index.
///
/// The chunk starts `get_task_idx() * size` elements past `arr`, so different
/// task indices address consecutive, non-overlapping chunks. The caller must
/// ensure the buffer behind `arr` is large enough to contain that chunk.
#[mu_kernel]
pub fn sort_with_ptr(arr: *mut i32, size: usize) {
    // Element offset of this task's chunk from the start of the buffer.
    let start = (get_task_idx() as usize) * size;

    // SAFETY: relies on the caller's guarantee that the buffer holds `size`
    // initialized `i32` values at element offset `start`, and that no other
    // code accesses that chunk while it is being sorted.
    let chunk = unsafe { std::slice::from_raw_parts_mut(arr.add(start), size) };
    chunk.sort_unstable();
}

Note: #[mu_kernel] must be included to initialize the kernel properly.

2. Create Your Host Application

Prepare the source code of the host application.

  1. Include the PXL module.
     use pxl::execute;
    
  2. Configure parallel processing arguments.
    sort_size is the number of elements to process per task. test_count is the number of parallel tasks.
     // Number of elements each task sorts.
     let sort_size: u32 = 64;
     // Number of parallel tasks.
     let test_count: u32 = 1024;
     // Total element count across all tasks; the product is computed in u32
     // and then widened to usize.
     let num_data = (test_count * sort_size) as usize;
    
  3. Set up PXL runtime instances.
     // Create the runtime context. The argument 0 is presumably a device
     // index — TODO confirm against the PXL API.
     let context = pxl::runtime::create_context(0);
     // Create a job from the context. The meaning of the argument 1 is not
     // visible in this document — TODO confirm against the PXL API.
     let job = context.create_job(1);        
    
  4. Prepare map execution.
     // Create a module from the kernel binary and obtain a function handle for
     // "sort_with_ptr" (presumably looked up by kernel name — confirm).
     let module = pxl::create_module("mu_kernel/mu_kernel.mubin");
     let mu_function = module.create_function("sort_with_ptr");
     // Load the same kernel binary path into the job.
     job.load("mu_kernel/mu_kernel.mubin");
     // Build the map from the function handle and test_count (as usize);
     // presumably this is the number of tasks to run — confirm.
     let map = job.build_map(&mu_function, &(test_count as usize));
    
  5. Allocate device memory, and initialize data.
     // Request enough bytes for num_data i32 values and treat the returned
     // address as a pointer to i32.
     let data = context.mem_alloc(num_data * std::mem::size_of::<i32>()) as *mut i32;
     // SAFETY (assumed): mem_alloc is expected to return a valid, writable
     // allocation of at least the requested size — TODO confirm.
     unsafe {
         let data_slice = std::slice::from_raw_parts_mut(data, num_data);
    
         // Fill each of the test_count chunks with sort_size, sort_size - 1, ..., 1,
         // so every chunk starts out in descending order.
         for i in 0..test_count {
             for j in 0..sort_size {
                 data_slice[(i as usize) * (sort_size as usize) + (j as usize)] =
                     (sort_size - j) as i32;
             }
         }
     }
    

    Initialize the input data as needed.

  6. Flush host cache (For CXL memory w/ CXL2.0 Host)
    A cache-flushing API will be added in the future.
  7. Execute the kernel.
     // Execute the map, passing the data pointer and the per-task element count.
     // A false result is treated as failure and ends the process with status 1.
     let ret = execute!(map(data, sort_size));
     if ret == false {
         println!("Map execution failed");
         std::process::exit(1);
     }
     // Presumably waits until the executed tasks have finished — confirm
     // against the PXL API. A false result is also treated as fatal.
     let ret = map.synchronize();
     if ret == false {
         println!("Map synchronization failed");
         std::process::exit(1);
     }
    

    For asynchronous execution, use a success callback as follows.

     use std::ffi::c_void;
     use std::sync::{
         atomic::{AtomicBool, Ordering},
         Arc,
     };
     use std::thread;
     use std::time::Duration;
        
     // Asynchronous variant: instead of calling map.synchronize(), register a
     // callback that sets a shared flag and poll that flag from the host.
     fn main() {
         ...
         // C-ABI callback. It interprets `ptr` as a pointer to an AtomicBool,
         // prints the pointer, and sets the flag to true.
         unsafe extern "C" fn success_callback(ptr: *mut c_void) {
             let flag = ptr as *const AtomicBool;
             println!("✅ Success callback called with ptr: {:?}", flag);
             (*flag).store(true, Ordering::SeqCst);
         }
        
         // Completion flag shared between this thread and the callback.
         let done_flag = Arc::new(AtomicBool::new(false));
         // Turn a clone of the Arc into a raw pointer to hand to the callback.
         // NOTE(review): Arc::into_raw keeps one strong reference alive until a
         // matching Arc::from_raw; none is shown in this snippet, so the flag's
         // allocation is never freed here.
         let done_ptr = Arc::into_raw(done_flag.clone()) as *mut c_void;
        
         // Pass the callback and the flag pointer to the map; presumably the
         // runtime invokes the callback with done_ptr on success — confirm.
         map.set_success_callback(success_callback as *mut c_void, done_ptr);
        
         let ret = execute!(map(data, sort_size));
         if ret == false {
             println!("Map execution failed");
             std::process::exit(1);
         }
        
         // Poll until the callback has set the flag, sleeping 1 ms per iteration.
         while !done_flag.load(Ordering::SeqCst) {
             thread::sleep(Duration::from_millis(1)); // avoid busy loop
         }
         ...
     }
    

    You can find the full example source code in examples/sort/sort_sync.rs or examples/sort/sort_async.rs.

3. Build Your Application

./build_rs.sh

This script automatically builds both the MU kernel and host application in one step.

4. Run and Check

  1. Run executable
     cargo run --bin sort_sync
    
     cargo run --bin sort_async
    

    You can also run them directly using rust-analyzer in a VS Code environment.

  2. Verify results
    Check the output data to confirm that the offloading process works correctly. If any element is not sorted correctly, an error message will be displayed.
     Number of devices: 1
     Job loaded
     🎉 Test done.