K-Nearest Neighbors (KNN) Implementation Tutorial with PXL

This tutorial demonstrates how to classify data using the K-Nearest Neighbors (KNN) algorithm with the PXL framework. We will cover the implementation of KNN using both Tensor and Pointer approaches with PXL’s Map for parallel processing.

Prerequisites:
Before starting, it is recommended to review the following tutorials:

  • SDK Workflow: Provides an overview of the SDK’s architecture and workflow.
  • Get Started: Guides you through setting up your development environment.
  • Hello Sort: Introduces basic concepts using a simple sorting example.
  • Emulator on Docker: Guides you through setting up the emulator on Docker.

Overview

The KNN algorithm works by:

  1. Finding the K nearest neighbors to a target point.
  2. Taking a majority vote of their labels to classify the target.

Our implementation includes three main approaches:

  • Host Implementation: Uses the CPU for computation (a minimal sketch follows this list).
  • Implementation using Tensor: Utilizes PXL’s Map execution mode to efficiently process data in parallel through tensor operations.
  • Implementation using Pointer: Directly manages memory with pointers to distribute computation tasks across multiple parallel processing units.
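
For orientation, here is a minimal host-side sketch of the algorithm itself (brute-force distance scan plus majority vote). It is illustrative only; the tutorial’s actual host reference is the getTopK helper from helper.hpp.

  #include <algorithm>
  #include <unordered_map>
  #include <utility>
  #include <vector>

  // Brute-force KNN: scan all points, sort by distance, then take a
  // majority vote over the labels of the k nearest points.
  int knn_host(const std::vector<std::vector<float>>& data,
               const std::vector<int>& label,
               const std::vector<float>& target, int k)
  {
      std::vector<std::pair<float, int>> distances(data.size());
      for (size_t i = 0; i < data.size(); i++)
      {
          float d = 0;
          for (size_t j = 0; j < target.size(); j++)
          {
              float diff = data[i][j] - target[j];
              d += diff * diff; // squared Euclidean distance
          }
          distances[i] = {d, static_cast<int>(i)};
      }
      std::sort(distances.begin(), distances.end());

      // Majority vote over the k nearest labels (ties broken arbitrarily)
      std::unordered_map<int, int> votes;
      for (int i = 0; i < k; i++)
          votes[label[distances[i].second]]++;
      return std::max_element(votes.begin(), votes.end(),
                              [](const auto& a, const auto& b) { return a.second < b.second; })
          ->first;
  }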

Important Notes

  • When running the application in a QEMU environment, start QEMU with the --no-kvm option to avoid issues:

     cd emulator
     ./run.sh --no-kvm

Detailed Implementation

1. Build Your Compute Kernel

  1. Write Distance Calculation Functions

    The kernel code, implemented in mu_knn.cpp, calculates distances using two metrics:

    • Euclidean Distance
    • Dot Product
     #include <algorithm>
     #include <utility>
     #include <vector>
    
     #include "mu/mu.hpp"
     #include "mu_knn_config.hpp"
    
     // Calculate the (squared) Euclidean distance; sqrt is omitted since it does not change the neighbor ordering
     void calculate_euclidean_distance(float* point0, float* point1, int pointSize, float* result)
     {
         float distance = 0;
         for (int i = 0; i < pointSize; i++)
         {
             distance += (point0[i] - point1[i]) * (point0[i] - point1[i]);
         }
         *result = distance;
     }
    
     // Calculate Dot Product
     void calculate_dot_product(float* point0, float* point1, int pointSize, float* result)
     {
         float distance = 0;
         for (int i = 0; i < pointSize; i++)
         {
             distance += point0[i] * point1[i];
         }
         *result = distance;
     }
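
    These metrics can be sanity-checked as plain host C++ (dropping the mu.hpp include), for example:

     #include <cassert>
     #include <cmath>

     // Assumes the two distance functions above are compiled into the same file.
     int main()
     {
         float a[3] = {1.0f, 2.0f, 3.0f};
         float b[3] = {4.0f, 6.0f, 3.0f};
         float result = 0;

         // (1-4)^2 + (2-6)^2 + (3-3)^2 = 9 + 16 + 0 = 25
         calculate_euclidean_distance(a, b, 3, &result);
         assert(std::fabs(result - 25.0f) < 1e-6f);

         // 1*4 + 2*6 + 3*3 = 25
         calculate_dot_product(a, b, 3, &result);
         assert(std::fabs(result - 25.0f) < 1e-6f);
         return 0;
     }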
    
  2. Implement KNN Functions

    1. KNN with Tensor

       void knn_with_tensor(float* data, int* label, float* target, int k, int dim, int numPoints, float* result, int* labelResult, bool useDotProduct)
       {
           std::vector<std::pair<float, int>> distances(numPoints);
           for (int i = 0; i < numPoints; i++)
           {
               float distance = 0;
               if (useDotProduct)
               {
                   calculate_dot_product(data + i * dim, target, dim, &distance);
               }
               else
               {
                   calculate_euclidean_distance(data + i * dim, target, dim, &distance);
               }
                distances[i] = std::make_pair(distance, i * dim); // (distance, flat offset of point i in data)
           }
      
           std::sort(distances.begin(), distances.end());
      
           for (int i = 0; i < k; i++)
           {
               for (int j = 0; j < dim; j++)
               {
                   result[i * dim + j] = data[distances[i].second + j];
               }
      
               labelResult[i] = label[distances[i].second / dim];
           }
       }
      
    2. KNN with Pointer

       void knn_with_ptr(float* data, int* label, float* target, const knnConfig& config, int numTasks, float* result, int* labelResult)
       {
            auto taskIdx = mu::getTaskIdx(); // index of this parallel task
            int chunkSize = config.dataCount / numTasks;
            int start = taskIdx * chunkSize;
            // the last task also takes any remainder points
            int end = (taskIdx == numTasks - 1) ? config.dataCount : start + chunkSize;
      
           std::vector<std::pair<float, int>> distances(end - start);
      
           for (int i = start; i < end; i++)
           {
               float distance = 0;
               if (config.useDotProduct)
               {
                   calculate_dot_product(data + i * config.dimension, target, config.dimension, &distance);
               }
               else
               {
                   calculate_euclidean_distance(data + i * config.dimension, target, config.dimension, &distance);
               }
      
               distances[i - start] = std::make_pair(distance, i * config.dimension);
           }
      
           std::sort(distances.begin(), distances.end());
      
           for (int i = 0; i < config.k; i++)
           {
               for (int j = 0; j < config.dimension; j++)
               {
                   int dataIndex = distances[i].second + j;
                   result[taskIdx * config.k * config.dimension + i * config.dimension + j] = data[dataIndex];
               }
               labelResult[taskIdx * config.k + i] = label[distances[i].second / config.dimension];
           }
       }
      
       MU_KERNEL_ADD(knn_with_tensor)
       MU_KERNEL_ADD(knn_with_ptr)
      

2. Create Your Host Application

  1. Setup PXL Runtime Instances

    You can initialize PXL runtime instances as shown below:

     uint32_t deviceId = 0;
     auto ctx = pxl::runtime::createContext(deviceId);
     auto job = ctx->createJob();
     auto module = pxl::createModule(filename);
     job->load(module);
    

    Alternatively, use the custom PxlResources struct, implemented in pxl_helper.hpp, to manage the lifecycle of PXL resources.

     struct PxlResources {
         pxl::runtime::Context* context;
         pxl::runtime::Job* job;
         pxl::runtime::Map* executor;
    
         ~PxlResources() {
             if (context) {
                 if (job) {
                     context->destroyJob(job);
                 }
                 pxl::runtime::destroyContext(context);
             }
         }
     };
    
     int main(int argc, char* argv[])
     {
         const uint32_t numTasks = 4;
         auto pxlResources = initializePxl(numTasks, Tensor); //or Ptr
    
         ... 
     }
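
    For reference, a rough sketch of what initializePxl may do, using only the runtime calls shown above. The ExecMode enum, the kernel module name, and the Map-creation step are assumptions; consult pxl_helper.hpp for the actual implementation.

     #include <cstdint>
     #include <memory>

     // Hypothetical execution-mode selector; the real type lives in pxl_helper.hpp.
     enum ExecMode { Tensor, Ptr };

     std::unique_ptr<PxlResources> initializePxl(uint32_t numTasks, ExecMode mode)
     {
         auto resources = std::make_unique<PxlResources>(); // members value-initialized to nullptr

         resources->context = pxl::runtime::createContext(/*deviceId=*/0);
         resources->job = resources->context->createJob();

         // Load the compiled kernel module containing knn_with_tensor / knn_with_ptr
         // ("mu_knn" is an assumed name).
         auto module = pxl::createModule("mu_knn");
         resources->job->load(module);

         // Create the Map executor for the kernel selected by `mode`; the exact
         // call is defined in pxl_helper.hpp and is omitted here.
         // resources->executor = ...;

         return resources;
     }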
    
  2. Setup Data

     1. Reading Data and Configuring KNN Parameters

       First, read the data from the input files using helper functions; these are implemented in helper.hpp.

       std::string dataFilename = "knn_data/dim_5/data.txt";
       std::string targetFilename = "knn_data/dim_5/targets.txt";
      
       auto [data, label] = readData<float>(dataFilename);
       auto target = readTarget<float>(targetFilename);
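
       For context, a minimal sketch of what readData might look like. It assumes each line of data.txt holds one point's features followed by an integer label; this format is an assumption, and the real parsing lives in helper.hpp.

        #include <fstream>
        #include <iterator>
        #include <sstream>
        #include <string>
        #include <utility>
        #include <vector>

        // Hypothetical sketch: one point per line, whitespace-separated features
        // followed by an integer label. See helper.hpp for the actual version.
        template <typename T>
        std::pair<std::vector<std::vector<T>>, std::vector<int>> readData(const std::string& filename)
        {
            std::vector<std::vector<T>> data;
            std::vector<int> labels;
            std::ifstream in(filename);
            std::string line;
            while (std::getline(in, line))
            {
                std::istringstream ss(line);
                std::vector<T> values{std::istream_iterator<T>(ss), std::istream_iterator<T>()};
                if (values.empty())
                    continue;
                labels.push_back(static_cast<int>(values.back())); // last field is the label
                values.pop_back();
                data.push_back(std::move(values));
            }
            return {data, labels};
        }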
      
      
       const uint32_t k = 3;
       const bool useDotProduct = false;
       const uint32_t dimension = data[0].size();
       const uint32_t dataCount = data.size();
       knnConfig config = {k, dimension, dataCount, useDotProduct};
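
       The knnConfig struct is defined in mu_knn_config.hpp and is shared between the host and the kernel. A minimal sketch consistent with the aggregate initializer and field accesses used in this tutorial (the actual header may differ):

        #include <cstdint>

        // Sketch of mu_knn_config.hpp; field order matches the initializer
        // {k, dimension, dataCount, useDotProduct} used above.
        struct knnConfig
        {
            uint32_t k;          // number of neighbors to return
            uint32_t dimension;  // number of features per point
            uint32_t dataCount;  // total number of data points
            bool useDotProduct;  // false: squared Euclidean distance, true: dot product
        };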
      
    2. Allocating Device Memory

       You can allocate device memory in a manner similar to the Hello Sort tutorial. In this tutorial, we use the helper function allocatePxlMemory together with a custom deleter, the PxlMemoryDeleter struct. allocatePxlMemory is a template that returns a PxlUniquePtr, a std::unique_ptr whose deleter releases the allocation through the PXL runtime’s memFree function, so device memory is freed automatically when it is no longer needed.

       template <typename T>
       struct PxlMemoryDeleter
       {
           pxl::runtime::Context* context;
           PxlMemoryDeleter(pxl::runtime::Context* ctx)
               : context(ctx)
           {
               if (!ctx)
                   throw std::invalid_argument("Null context provided to PxlMemoryDeleter");
           }
           void operator()(T* ptr)
           {
               if (ptr && context)
                   context->memFree(ptr);
           }
       };
      
       template <typename T>
       using PxlUniquePtr = std::unique_ptr<T, PxlMemoryDeleter<T>>;
      
       template <typename T>
       PxlUniquePtr<T> allocatePxlMemory(pxl::runtime::Context* context, size_t count)
       {
           T* ptr = reinterpret_cast<T*>(context->memAlloc(count * sizeof(T)));
           if (!ptr)
           {
               throw std::runtime_error("Failed to allocate PXL memory");
           }
           return PxlUniquePtr<T>(ptr, PxlMemoryDeleter<T>(context));
       }
      

       Allocate memory for the data and labels:

       int main(int argc, char* argv[])
       {
           ...
      
           auto muData = allocatePxlMemory<float>(pxlResources->context, dataCount * dimension);
           auto dataTensor = pxl::runtime::Tensor<float>(
               muData.get(), 
               {static_cast<size_t>(numTasks), (dataCount * dimension) / static_cast<size_t>(numTasks)}
            ); // tensor view over the data, used by the Tensor (Map) implementation
      
           auto muLabel = allocatePxlMemory<int>(pxlResources->context, dataCount);
           auto labelTensor = pxl::runtime::Tensor<int>(
               muLabel.get(), 
               {static_cast<size_t>(numTasks), (dataCount) / static_cast<size_t>(numTasks)}
            ); // tensor view over the labels, used by the Tensor (Map) implementation
      
           ...
              
      
     3. Writing Data to Device Memory

           for (size_t i = 0; i < dataCount; i++)
           {
               std::memcpy(muData.get() + i * dimension, data[i].data(), dimension * sizeof(float));
           }
      
           std::memcpy(muLabel.get(), label.data(), dataCount * sizeof(int));
      
            // On a CXL 2.0 host, flush the host cache to the device. This can be skipped in the QEMU-SIM environment.
           pxl::flushHostCache(muData.get(), dataCount * dimension * sizeof(float));
           pxl::flushHostCache(muLabel.get(), dataCount * sizeof(int));
      
           ...
       }
      
  3. Run KNN

     // Iterate over each target
     for (size_t i = 0; i < target.size(); i++)
     {
         std::vector<std::vector<float>> executorResult(numTasks * config.k, std::vector<float>(config.dimension));
         std::vector<int> labelResult(numTasks * config.k);
    
          knnTensor(pxlResources->context, pxlResources->executor, dataTensor, labelTensor, target[i], config, numTasks, executorResult, labelResult); // Tensor implementation
          knnPtr(pxlResources->context, pxlResources->executor, muData.get(), muLabel.get(), target[i], config, numTasks, executorResult, labelResult); // Pointer implementation (use whichever matches your kernel)
    
         // Validate and process results
         ...
     }
    
     1. Implementation using Tensor

       The knnTensor function uses tensor operations to execute the KNN algorithm. This includes:

       • Preparing the necessary tensors (target tensor, result tensor, and label tensor).
       • Executing the Map operation via the executor->execute function.
       • Converting the results into a format that can be returned to the host.
       void knnTensor(pxl::runtime::Context* context, pxl::runtime::Map* executor, pxl::runtime::Tensor<float> dataTensor,
                   pxl::runtime::Tensor<int> labelTensor, const std::vector<float>& target,
                   const knnConfig& config, int numTasks,
                   std::vector<std::vector<float>>& result, std::vector<int>& labelResult)
       {
            // Allocate and prepare the target tensor. The target is replicated once per task,
            // because the Map splits each tensor along its first dimension and every task needs the same target.
           auto muTarget = allocatePxlMemory<float>(context, numTasks * config.dimension);
           auto targetTensor = pxl::runtime::Tensor<float>(muTarget.get(), {static_cast<size_t>(numTasks), config.dimension});
           for (int j = 0; j < numTasks; j++)
           {
               std::memcpy(muTarget.get() + j * config.dimension, target.data(), config.dimension * sizeof(float));
           }
      
           // Allocate result tensor
           auto muResult = allocatePxlMemory<float>(context, numTasks * config.dimension * config.k);
           auto resultTensor = pxl::runtime::Tensor<float>(muResult.get(),
                                                           {static_cast<size_t>(numTasks), config.dimension * static_cast<size_t>(config.k)});
      
           // Allocate label result tensor
           auto muLabelResult = allocatePxlMemory<int>(context, numTasks * config.k);
           auto labelResultTensor = pxl::runtime::Tensor<int>(muLabelResult.get(),
                                                           {static_cast<size_t>(numTasks), static_cast<size_t>(config.k)});
      
            // On a CXL 2.0 host, flush the host cache to the device. This can be skipped in the QEMU-SIM environment.
           pxl::flushHostCache(muTarget.get(), numTasks * config.dimension * sizeof(float));
           pxl::flushHostCache(muResult.get(), numTasks * config.dimension * config.k * sizeof(float));
           pxl::flushHostCache(muLabelResult.get(), numTasks * config.k * sizeof(int));
      
           // Execute the map operation
           if (!executor->execute(dataTensor, labelTensor, targetTensor, config.k, config.dimension, config.dataCount / numTasks, resultTensor, labelResultTensor, config.useDotProduct))
           {
               throw std::runtime_error("Map Execute Failed");
           }
      
           if (!executor->synchronize())
           {
               throw std::runtime_error("Map Synchronize Failed");
           }
      
           // Convert result to return format
           for (int i = 0; i < numTasks * config.k; i++)
           {
               for (int j = 0; j < config.dimension; j++)
               {
                   result[i][j] = muResult.get()[i * config.dimension + j];
               }
               labelResult[i] = muLabelResult.get()[i];
           }
       }
      

       Note: the executor automatically divides each input tensor into chunks along its first dimension and distributes one chunk to each task. The target tensor is therefore split as well; with numTasks = 4, the {4, dimension} target tensor gives each task one row, which is why the same target must be written into every row.

     2. Implementation using Pointer

       The knnPtr function uses direct pointer operations to execute the KNN algorithm, dividing the workload among multiple tasks for efficient parallel computation. This includes:

       • Allocating and preparing the necessary memory for the target.
       • Executing the operation via the executor->execute function.
       • Converting the results into a format that can be returned to the host.
       void knnPtr(pxl::runtime::Context* context, pxl::runtime::Map* executor, float* data,
                       int* label, const std::vector<float>& target,
                       const knnConfig& config, int numTasks,
                       std::vector<std::vector<float>>& result, std::vector<int>& labelResult)
       {
            // Allocate device memory for the target and copy it in. Unlike the Tensor version,
            // a single copy is shared by all tasks, since each task receives the same pointer.
           auto muTarget = allocatePxlMemory<float>(context, config.dimension);
           std::memcpy(muTarget.get(), target.data(), config.dimension * sizeof(float));
      
           // Allocate result memory
           auto muResult = allocatePxlMemory<float>(context, numTasks * config.dimension * config.k);
      
           // Allocate label result memory
           auto muLabelResult = allocatePxlMemory<int>(context, numTasks * config.k);
      
            // On a CXL 2.0 host, flush the host cache to the device. This can be skipped in the QEMU-SIM environment.
           pxl::flushHostCache(muTarget.get(), config.dimension * sizeof(float));
           pxl::flushHostCache(muResult.get(), numTasks * config.dimension * config.k * sizeof(float));
           pxl::flushHostCache(muLabelResult.get(), numTasks * config.k * sizeof(int));
      
           // Execute the map operation
           if (!executor->execute(data, label, muTarget.get(), config, numTasks, muResult.get(), muLabelResult.get()))
           {
               throw std::runtime_error("Map Execute Failed");
           }
      
           if (!executor->synchronize())
           {
               throw std::runtime_error("Map Synchronize Failed");
           }
      
           // Convert result to return format
           for (int i = 0; i < numTasks * config.k; i++)
           {
               for (int j = 0; j < config.dimension; j++)
               {
                   result[i][j] = muResult.get()[i * config.dimension + j];
               }
               labelResult[i] = muLabelResult.get()[i];
           }
       }
      
  4. Validate and Process Results

     Once the KNN algorithm has been executed in Map mode, the results need to be:

     1. Validated against the CPU-based host results.
     2. Processed for errors.
     3. Displayed or logged as the final classified label.

     Because each task returns only its local top k, the host first merges the per-task candidates by running getTopK over all numTasks * k returned points to obtain the global top k. The following code handles these tasks:

    • Validation: Compare the executorTopKResult and executorMajorityLabel from Map mode to the hostTopKResult and hostMajorityLabel from the host.
    • Error Handling: Log discrepancies using printErrorDetails.
    • Output: Print the final classified label to the console.
      // Host reference: top k computed over the whole data set
     auto [hostTopKResult, hostMajorityLabel] = getTopK(data, label, target[i], config.k, config.dimension, dataCount, config.useDotProduct);
    
      // Merge per-task results: top k over the numTasks * k candidates returned by the kernel
     auto [executorTopKResult, executorMajorityLabel] = getTopK(executorResult, labelResult, target[i], config.k, config.dimension, numTasks * config.k, config.useDotProduct);
    
     if (executorTopKResult != hostTopKResult || executorMajorityLabel != hostMajorityLabel)
     {
         printErrorDetails<float>(directoryName, target[i], hostTopKResult, executorTopKResult, hostMajorityLabel, executorMajorityLabel, k);
     }
     else
     {
         std::cout << "Classified label : " << executorMajorityLabel[0] << std::endl;
     }
    

3. Build and Run

  1. Build

     Build the application using the build.sh script.

     ./build.sh
    
  2. Run the executables and check the results

     ./knn_with_tensor
     ./knn_with_ptr
    
     Classified label : 5