#include <spu_mfcio.h>
#include <libmisc.h>
#include "dma.h"

// Control block describing the work to do. main() fills it by DMA from the
// main memory address passed in argp, and then reads its num_elements and
// data_addr fields. It is aligned to 16 bytes because it is used as a DMA
// destination (MFC transfers of 16 bytes or more need 16-byte aligned
// addresses).
CONTROL_BLOCK cb __attribute__((aligned(16)));
// Local store buffer holding the elements being processed. main() allocates
// it with malloc_align() and uses it as both DMA destination and DMA source.
vec_int4 *data;

// Queues DMA commands on tag 0 that move `size` bytes between the local store
// address `ls` and the main memory address `ea`. A single MFC DMA command can
// move at most 16 KB, so larger requests are split into consecutive pieces.
// If `to_main_memory` is nonzero the data is written out with mfc_put(),
// otherwise it is read in with mfc_get(). The commands are only queued here;
// the caller is responsible for waiting on tag 0.
static void transfer_in_chunks(void *ls, uint64_t ea, uint32_t size,
                               int to_main_memory)
{
  const uint32_t max_chunk = 16 * 1024;
  char *ls_ptr = ls;

  while (size > 0) {
    uint32_t chunk = (size < max_chunk) ? size : max_chunk;

    if (to_main_memory) {
      mfc_put(ls_ptr, ea, chunk, 0, 0, 0);
    } else {
      mfc_get(ls_ptr, ea, chunk, 0, 0, 0);
    }

    ls_ptr += chunk;
    ea += chunk;
    size -= chunk;
  }
}

// SPU entry point.
//
// argp is the main memory address of a CONTROL_BLOCK. The program DMAs the
// control block in, DMAs in cb.num_elements elements from cb.data_addr,
// applies a vector multiply-add (MUL_FACTOR, ADD_FACTOR) to them, DMAs the
// result back to cb.data_addr and finally writes a status word to the
// outbound mailbox: 0 on success, 1 if the local buffer could not be
// allocated. The same value is returned. speid and envp are not used.
int main(uint64_t speid, uint64_t argp, uint64_t envp)
{
  uint32_t data_size;

  // DMA in the control block so we know where to get the data from
  mfc_get(&cb,         // destination local store address
          argp,        // source main memory address
          sizeof(cb),  // number of bytes to transfer
          0,           // tag (0-31)
          0,           // transfer class id (unused here)
          0);          // replacement class id (unused here)

  // Wait for the DMA to complete. Mask is set for only tag 0.
  // mfc_read_tag_status_all() blocks until all DMA with the masked tags have
  // completed.
  mfc_write_tag_mask(1 << 0);
  mfc_read_tag_status_all();

  // Allocate a buffer to hold the data, aligned on a 2^7 (128) byte boundary
  data_size = cb.num_elements * ELEMENT_SIZE;
  data = malloc_align(data_size, 7);
  if (!data) {
    // Without a buffer the DMA below would target local store address 0.
    // Still notify the PPU so it is not left waiting on the mailbox; a
    // nonzero value marks the failure (success sends 0).
    // NOTE(review): the PPU side is not visible here -- confirm it treats a
    // nonzero mailbox value as an error.
    spu_write_out_mbox(1);
    return 1;
  }

  // DMA in the actual data (split into <= 16 KB commands) and wait for it.
  // Since we're using the same tag (0), we don't need to write the tag mask
  // again.
  transfer_in_chunks(data, cb.data_addr, data_size, 0);
  mfc_read_tag_status_all();

  // Do processing using vector intrinsics, four elements per iteration
  for (int i = 0; i < cb.num_elements / 4; i++) {
    // spu_madd does a vector multiply and add instruction. For integers the
    // multiply is 16-bit: the data is reinterpreted as halfwords and only
    // the low halfword of each 32-bit word takes part in the product, which
    // is then added to the 32-bit addend.
    data[i] = spu_madd(*(vec_short8 *)&data[i],
                       (vec_short8)(MUL_FACTOR),
                       (vec_int4)(ADD_FACTOR));
  }

  // DMA out processed data and wait for it to complete
  transfer_in_chunks(data, cb.data_addr, data_size, 1);
  mfc_read_tag_status_all();

  // Notify PPU via mailbox
  spu_write_out_mbox(0);
  return 0;
}
