/*
 * Copyright (C) 2022 Pascal Nowack
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#include "config.h"

#include "grd-rdp-damage-detector-cuda.h"

#include "grd-hwaccel-nvidia.h"
#include "grd-rdp-legacy-buffer.h"

/*
 * Size of one damage tile. The surface is partitioned into a grid of
 * cols x rows tiles of this size; tiles at the right and bottom edge are
 * clipped to the surface size when the damage region is assembled.
 */
#define TILE_WIDTH 64
#define TILE_HEIGHT 64

/*
 * Instance data of the CUDA based damage detector.
 */
typedef struct _GrdRdpDamageDetectorCuda
{
  GrdRdpDamageDetector parent;

  /* CUDA function table, obtained from the GrdHwAccelNvidia instance */
  CudaFunctions *cuda_funcs;
  /* Stream on which all memsets, kernel launches and copies are queued */
  CUstream cuda_stream;

  /* Damage kernels, obtained from the GrdHwAccelNvidia instance */
  CUfunction cu_chk_dmg_pxl;
  CUfunction cu_cmb_dmg_arr_cols;
  CUfunction cu_cmb_dmg_arr_rows;
  CUfunction cu_simplify_dmg_arr;

  /* Surface size, as passed to the last resize_surface() call */
  uint32_t surface_width;
  uint32_t surface_height;

  /* Amount of tile columns and tile rows covering the surface */
  uint32_t cols;
  uint32_t rows;

  /*
   * Most recently submitted framebuffer, which the next submitted
   * framebuffer is compared against. NULL after a resize or invalidation.
   */
  GrdRdpLegacyBuffer *last_framebuffer;

  /* Device memory: single byte flag, allocated in the constructor */
  CUdeviceptr region_is_damaged;
  /* Device memory: surface_width * surface_height bytes; 0 if unallocated */
  CUdeviceptr damage_array;
  /* Device memory: cols * rows bytes; 0 if unallocated */
  CUdeviceptr simplified_damage_array;
} GrdRdpDamageDetectorCuda;

/* Registers GrdRdpDamageDetectorCuda as a subtype of GrdRdpDamageDetector */
G_DEFINE_TYPE (GrdRdpDamageDetectorCuda,
               grd_rdp_damage_detector_cuda,
               GRD_TYPE_RDP_DAMAGE_DETECTOR)

/*
 * Forgets the reference frame and marks the complete surface as damaged.
 *
 * The last framebuffer (if any) is released. When a damage array exists,
 * memsets are queued on the detector's stream that set every byte of the
 * damage array and the region_is_damaged flag to 1.
 *
 * Returns: TRUE on success, also when no damage array is allocated.
 *          FALSE when queueing one of the memsets failed.
 */
static gboolean
invalidate_surface (GrdRdpDamageDetector *detector)
{
  GrdRdpDamageDetectorCuda *self = GRD_RDP_DAMAGE_DETECTOR_CUDA (detector);
  CudaFunctions *funcs = self->cuda_funcs;
  uint32_t n_damage_bytes = self->surface_width * self->surface_height;
  gboolean success;

  g_clear_pointer (&self->last_framebuffer, grd_rdp_legacy_buffer_release);

  /* Without a damage array there is nothing to reset */
  if (self->damage_array)
    {
      /* The flag is only touched, when resetting the array was queued */
      success = funcs->cuMemsetD8Async (self->damage_array, 1, n_damage_bytes,
                                        self->cuda_stream) == CUDA_SUCCESS &&
                funcs->cuMemsetD8Async (self->region_is_damaged, 1, 1,
                                        self->cuda_stream) == CUDA_SUCCESS;
      if (!success)
        {
          g_warning ("[HWAccel.CUDA] Failed to set memory");
          return FALSE;
        }
    }

  return TRUE;
}

/*
 * Frees the device memory referenced by @device_ptr using the detector's
 * CUDA function table and resets @device_ptr to 0.
 * Does nothing, when @device_ptr is already 0.
 */
static void
clear_cuda_pointer (GrdRdpDamageDetectorCuda *detector_cuda,
                    CUdeviceptr              *device_ptr)
{
  if (*device_ptr)
    {
      detector_cuda->cuda_funcs->cuMemFree (*device_ptr);
      *device_ptr = 0;
    }
}

/*
 * Re-creates the device side damage arrays for a surface of the size
 * @width x @height.
 *
 * The reference frame and both previous arrays are dropped first. Then the
 * damage array (width * height bytes) and the simplified damage array
 * (cols * rows bytes, one byte per tile) are allocated. Finally, memsets are
 * queued on the detector's stream, which set every byte of the damage array
 * and the region_is_damaged flag to 1.
 *
 * Returns: TRUE on success.
 *          FALSE when an allocation or queueing a memset failed. After a
 *          failed allocation, neither of the two arrays is left allocated,
 *          so the detector never ends up with only one of them.
 */
static gboolean
resize_surface (GrdRdpDamageDetector *detector,
                uint32_t              width,
                uint32_t              height)
{
  GrdRdpDamageDetectorCuda *detector_cuda =
    GRD_RDP_DAMAGE_DETECTOR_CUDA (detector);
  CudaFunctions *cuda_funcs = detector_cuda->cuda_funcs;
  uint32_t cols;
  uint32_t rows;

  g_clear_pointer (&detector_cuda->last_framebuffer,
                   grd_rdp_legacy_buffer_release);

  clear_cuda_pointer (detector_cuda, &detector_cuda->simplified_damage_array);
  clear_cuda_pointer (detector_cuda, &detector_cuda->damage_array);

  detector_cuda->surface_width = width;
  detector_cuda->surface_height = height;

  /* Amount of tiles per dimension, rounded up for partially covered tiles */
  cols = width / TILE_WIDTH + (width % TILE_WIDTH ? 1 : 0);
  rows = height / TILE_HEIGHT + (height % TILE_HEIGHT ? 1 : 0);
  detector_cuda->cols = cols;
  detector_cuda->rows = rows;

  if (cuda_funcs->cuMemAlloc (&detector_cuda->damage_array,
                              width * height) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to allocate damage array");
      /* Don't rely on the output pointer being left untouched on failure */
      detector_cuda->damage_array = 0;
      return FALSE;
    }
  if (cuda_funcs->cuMemAlloc (&detector_cuda->simplified_damage_array,
                              cols * rows) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to allocate simplified damage array");
      detector_cuda->simplified_damage_array = 0;
      /*
       * Drop the damage array again. Otherwise, the detector would look
       * usable, although the simplified damage array is missing.
       */
      clear_cuda_pointer (detector_cuda, &detector_cuda->damage_array);
      return FALSE;
    }
  if (cuda_funcs->cuMemsetD8Async (detector_cuda->damage_array, 1, width * height,
                                   detector_cuda->cuda_stream) != CUDA_SUCCESS ||
      cuda_funcs->cuMemsetD8Async (detector_cuda->region_is_damaged, 1, 1,
                                   detector_cuda->cuda_stream) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to set memory");
      return FALSE;
    }

  return TRUE;
}

/*
 * Submits @buffer as the new frame and queues the damage check against the
 * previously submitted frame.
 *
 * Without a previous frame, the complete surface is marked as damaged
 * (damage array and region_is_damaged flag set to 1) and @buffer is stored
 * as the reference frame.
 * Otherwise, the region_is_damaged flag is reset to 0, the CHK_DMG_PXL
 * kernel is launched on the detector's stream with one thread per surface
 * position, the previous frame is released and @buffer is stored in its
 * place.
 *
 * Returns: TRUE on success. FALSE when queueing a memset or launching the
 *          kernel failed; in that case, the stored frame is left unchanged.
 */
static gboolean
submit_new_framebuffer (GrdRdpDamageDetector *detector,
                        GrdRdpLegacyBuffer   *buffer)
{
  GrdRdpDamageDetectorCuda *self = GRD_RDP_DAMAGE_DETECTOR_CUDA (detector);
  CudaFunctions *funcs = self->cuda_funcs;
  uint32_t width = self->surface_width;
  uint32_t height = self->surface_height;
  /* Threads per block */
  unsigned int threads_x = 32, threads_y = 16, threads_z = 1;
  /* Amount of blocks per grid */
  unsigned int blocks_x, blocks_y, blocks_z;
  CUdeviceptr new_frame;
  CUdeviceptr old_frame;
  /*
   * NOTE(review): The kernel signature is not visible here. The trailing
   * parameters are passed as (width, width, height, width); presumably the
   * second and fourth one are strides - confirm against the kernel source.
   */
  void *kernel_args[8] =
    {
      &self->damage_array,
      &self->region_is_damaged,
      &new_frame,
      &old_frame,
      &width,
      &width,
      &height,
      &width,
    };

  g_assert (self->damage_array);

  if (!self->last_framebuffer)
    {
      gboolean all_damaged;

      /* Nothing to compare against: Treat the whole surface as damaged */
      all_damaged =
        funcs->cuMemsetD8Async (self->damage_array, 1, width * height,
                                self->cuda_stream) == CUDA_SUCCESS &&
        funcs->cuMemsetD8Async (self->region_is_damaged, 1, 1,
                                self->cuda_stream) == CUDA_SUCCESS;
      if (!all_damaged)
        {
          g_warning ("[HWAccel.CUDA] Failed to set memory");
          return FALSE;
        }

      self->last_framebuffer = buffer;

      return TRUE;
    }

  /* The flag is reset here; the kernel gets its address to report damage */
  if (funcs->cuMemsetD8Async (self->region_is_damaged, 0, 1,
                              self->cuda_stream) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to set memory");
      return FALSE;
    }

  new_frame = grd_rdp_legacy_buffer_get_mapped_cuda_pointer (buffer);
  old_frame =
    grd_rdp_legacy_buffer_get_mapped_cuda_pointer (self->last_framebuffer);

  /* Round up, so that partially filled blocks cover the surface edges */
  blocks_x = width / threads_x + (width % threads_x != 0);
  blocks_y = height / threads_y + (height % threads_y != 0);
  blocks_z = 1;

  if (funcs->cuLaunchKernel (self->cu_chk_dmg_pxl,
                             blocks_x, blocks_y, blocks_z,
                             threads_x, threads_y, threads_z,
                             0, self->cuda_stream,
                             kernel_args, NULL) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to launch CHK_DMG_PXL kernel");
      return FALSE;
    }

  /* The new frame becomes the reference for the next submission */
  g_clear_pointer (&self->last_framebuffer, grd_rdp_legacy_buffer_release);
  self->last_framebuffer = buffer;

  return TRUE;
}

/*
 * Reads back the region_is_damaged flag from device memory and waits for
 * the detector's stream to finish.
 *
 * Must only be called, when a damage array is allocated and a framebuffer
 * was submitted (both are asserted).
 *
 * Returns: TRUE when the flag is non-zero, FALSE when it is zero.
 *          When the flag cannot be read back, TRUE is returned, so that a
 *          CUDA failure is treated as damage instead of returning an
 *          indeterminate value.
 */
static gboolean
is_region_damaged (GrdRdpDamageDetector *detector)
{
  GrdRdpDamageDetectorCuda *detector_cuda =
    GRD_RDP_DAMAGE_DETECTOR_CUDA (detector);
  CudaFunctions *cuda_funcs = detector_cuda->cuda_funcs;
  /* Stays at "damaged", if the copy never writes the host variable */
  uint8_t is_damaged = 1;
  gboolean copy_queued;
  gboolean stream_synced;

  g_assert (detector_cuda->damage_array);
  g_assert (detector_cuda->last_framebuffer);

  copy_queued =
    cuda_funcs->cuMemcpyDtoHAsync (&is_damaged,
                                   detector_cuda->region_is_damaged,
                                   1, detector_cuda->cuda_stream) == CUDA_SUCCESS;
  /*
   * Always synchronize: The copy targets a stack variable and therefore
   * must not be pending any more, when this function returns.
   */
  stream_synced =
    cuda_funcs->cuStreamSynchronize (detector_cuda->cuda_stream) == CUDA_SUCCESS;

  if (!copy_queued)
    {
      g_warning ("[HWAccel.CUDA] Failed to transfer damage state");
      return TRUE;
    }
  if (!stream_synced)
    {
      g_warning ("[HWAccel.CUDA] Failed to synchronize stream");
      return TRUE;
    }

  return !!is_damaged;
}

/*
 * Builds a cairo region out of the host copy of the simplified damage array.
 *
 * @simplified_damage_array holds one byte per tile in row-major order
 * (cols entries per row). For every non-zero entry, the rectangle of the
 * respective tile is added to the region. Tiles reaching beyond the surface
 * are clipped to the surface size.
 *
 * Returns: a newly created region (empty, when no tile is flagged).
 */
static cairo_region_t *
get_cairo_region (GrdRdpDamageDetectorCuda *detector_cuda,
                  uint8_t                  *simplified_damage_array)
{
  uint32_t width = detector_cuda->surface_width;
  uint32_t height = detector_cuda->surface_height;
  uint32_t n_cols = detector_cuda->cols;
  uint32_t n_rows = detector_cuda->rows;
  cairo_region_t *region = cairo_region_create ();
  uint32_t row, col;

  for (row = 0; row < n_rows; ++row)
    {
      for (col = 0; col < n_cols; ++col)
        {
          cairo_rectangle_int_t tile;
          uint32_t remaining_width;
          uint32_t remaining_height;

          if (!simplified_damage_array[row * n_cols + col])
            continue;

          tile.x = col * TILE_WIDTH;
          tile.y = row * TILE_HEIGHT;

          /* Clip the tiles of the last column and the last row */
          remaining_width = width - tile.x;
          remaining_height = height - tile.y;

          tile.width = TILE_WIDTH;
          if (remaining_width < TILE_WIDTH)
            tile.width = remaining_width;

          tile.height = TILE_HEIGHT;
          if (remaining_height < TILE_HEIGHT)
            tile.height = remaining_height;

          cairo_region_union_rectangle (region, &tile);
        }
    }

  return region;
}

/*
 * Reduces the damage array to tile granularity on the device and returns
 * the damaged tiles as cairo region.
 *
 * The following work is queued on the detector's stream, in this order:
 *  1. Six CMB_DMG_ARR_COLS launches with the shift values 0..5
 *  2. Six CMB_DMG_ARR_ROWS launches with the shift values 0..5
 *  3. One SIMPLIFY_DMG_ARR launch, writing the simplified damage array
 *  4. A copy of the simplified damage array (cols * rows bytes) to the host
 * Afterwards, the stream is synchronized and the region is assembled.
 *
 * NOTE(review): The kernel sources are not visible here. Presumably, each
 * combine pass merges damage over a distance of 2^shift, so that six passes
 * span one 64 wide (or high) tile - confirm against the kernel source.
 *
 * Must only be called, when a damage array is allocated and a framebuffer
 * was submitted (both are asserted).
 *
 * Returns: a newly created region, or NULL when a CUDA operation failed.
 */
static cairo_region_t *
get_damage_region (GrdRdpDamageDetector *detector)
{
  GrdRdpDamageDetectorCuda *self = GRD_RDP_DAMAGE_DETECTOR_CUDA (detector);
  CudaFunctions *funcs = self->cuda_funcs;
  g_autofree uint8_t *host_tiles = NULL;
  /* Threads per block */
  unsigned int threads_x = 32, threads_y = 16, threads_z = 1;
  /* Amount of blocks per grid */
  unsigned int blocks_x, blocks_y, blocks_z = 1;
  /* Amount of blocks needed to cover the whole surface per dimension */
  unsigned int surface_blocks_x, surface_blocks_y;
  uint32_t shifts[6] = { 0, 1, 2, 3, 4, 5 };
  void *cols_args[5] =
    {
      &self->damage_array,
      &self->surface_width,
      &self->surface_height,
      &self->surface_width,
      NULL, /* Shift value, set per pass */
    };
  void *rows_args[5] =
    {
      &self->damage_array,
      &self->surface_width,
      &self->surface_height,
      &self->surface_width,
      NULL, /* Shift value, set per pass */
    };
  void *simplify_args[6] =
    {
      &self->simplified_damage_array,
      &self->damage_array,
      &self->cols,
      &self->surface_width,
      &self->surface_height,
      &self->surface_width,
    };
  uint32_t n_tiles;
  uint32_t pass;

  g_assert (self->damage_array);
  g_assert (self->last_framebuffer);

  surface_blocks_x = self->surface_width / threads_x +
                     (self->surface_width % threads_x != 0);
  surface_blocks_y = self->surface_height / threads_y +
                     (self->surface_height % threads_y != 0);

  /* Column passes: The grid width shrinks with every pass */
  for (pass = 0; pass < 6; ++pass)
    {
      cols_args[4] = &shifts[pass];

      blocks_x = (self->surface_width >> (pass + 1)) / threads_x + 1;
      blocks_y = surface_blocks_y;

      if (funcs->cuLaunchKernel (self->cu_cmb_dmg_arr_cols,
                                 blocks_x, blocks_y, blocks_z,
                                 threads_x, threads_y, threads_z,
                                 0, self->cuda_stream,
                                 cols_args, NULL) != CUDA_SUCCESS)
        {
          g_warning ("[HWAccel.CUDA] Failed to launch CMB_DMG_ARR_COLS kernel");
          return NULL;
        }
    }

  /* Row passes: The grid height shrinks with every pass */
  for (pass = 0; pass < 6; ++pass)
    {
      rows_args[4] = &shifts[pass];

      blocks_x = surface_blocks_x;
      blocks_y = (self->surface_height >> (pass + 1)) / threads_y + 1;

      if (funcs->cuLaunchKernel (self->cu_cmb_dmg_arr_rows,
                                 blocks_x, blocks_y, blocks_z,
                                 threads_x, threads_y, threads_z,
                                 0, self->cuda_stream,
                                 rows_args, NULL) != CUDA_SUCCESS)
        {
          g_warning ("[HWAccel.CUDA] Failed to launch CMB_DMG_ARR_ROWS kernel");
          return NULL;
        }
    }

  /* The simplify kernel is launched with a grid covering the whole surface */
  blocks_x = surface_blocks_x;
  blocks_y = surface_blocks_y;

  if (funcs->cuLaunchKernel (self->cu_simplify_dmg_arr,
                             blocks_x, blocks_y, blocks_z,
                             threads_x, threads_y, threads_z,
                             0, self->cuda_stream,
                             simplify_args, NULL) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to launch SIMPLIFY_DMG_ARR kernel");
      return NULL;
    }

  n_tiles = self->cols * self->rows;
  host_tiles = g_malloc0 (n_tiles * sizeof (uint8_t));

  if (funcs->cuMemcpyDtoHAsync (host_tiles, self->simplified_damage_array,
                                n_tiles, self->cuda_stream) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to transfer simplified damage array");
      return NULL;
    }
  /* The host copy is only valid, after the stream finished its work */
  if (funcs->cuStreamSynchronize (self->cuda_stream) != CUDA_SUCCESS)
    {
      g_warning ("[HWAccel.CUDA] Failed to synchronize stream");
      return NULL;
    }

  return get_cairo_region (self, host_tiles);
}

/*
 * Creates a new CUDA based damage detector.
 *
 * The CUDA function table and the damage kernels are fetched from
 * @hwaccel_nvidia. All CUDA work of the detector is queued on @cuda_stream.
 * Additionally, the single byte region_is_damaged flag is allocated in
 * device memory.
 *
 * Returns: the new detector, or NULL when allocating the flag failed.
 */
GrdRdpDamageDetectorCuda *
grd_rdp_damage_detector_cuda_new (GrdHwAccelNvidia *hwaccel_nvidia,
                                  CUstream          cuda_stream)
{
  g_autoptr (GrdRdpDamageDetectorCuda) self = NULL;

  self = g_object_new (GRD_TYPE_RDP_DAMAGE_DETECTOR_CUDA, NULL);
  self->cuda_stream = cuda_stream;

  grd_hwaccel_nvidia_get_cuda_functions (hwaccel_nvidia,
                                         (gpointer *) &self->cuda_funcs);
  grd_hwaccel_nvidia_get_cuda_damage_kernels (hwaccel_nvidia,
                                              &self->cu_chk_dmg_pxl,
                                              &self->cu_cmb_dmg_arr_cols,
                                              &self->cu_cmb_dmg_arr_rows,
                                              &self->cu_simplify_dmg_arr);

  /* On success, hand the instance over to the caller */
  if (self->cuda_funcs->cuMemAlloc (&self->region_is_damaged, 1) == CUDA_SUCCESS)
    return g_steal_pointer (&self);

  /* The g_autoptr drops the half-initialized instance */
  return NULL;
}

/*
 * GObject dispose handler: Frees all device memory of the detector and
 * chains up to the parent class.
 *
 * The reference frame must already be gone at this point (asserted).
 */
static void
grd_rdp_damage_detector_cuda_dispose (GObject *object)
{
  GrdRdpDamageDetectorCuda *self = GRD_RDP_DAMAGE_DETECTOR_CUDA (object);

  g_assert (!self->last_framebuffer);

  /* Each pointer is reset to 0, so running dispose again frees nothing twice */
  clear_cuda_pointer (self, &self->simplified_damage_array);
  clear_cuda_pointer (self, &self->damage_array);
  clear_cuda_pointer (self, &self->region_is_damaged);

  G_OBJECT_CLASS (grd_rdp_damage_detector_cuda_parent_class)->dispose (object);
}

/*
 * Instance initializer required by G_DEFINE_TYPE. Intentionally empty:
 * The instance members are set up in grd_rdp_damage_detector_cuda_new().
 */
static void
grd_rdp_damage_detector_cuda_init (GrdRdpDamageDetectorCuda *detector_cuda)
{
}

/*
 * Class initializer: Installs the dispose handler and the CUDA
 * implementations of the GrdRdpDamageDetector virtual functions.
 */
static void
grd_rdp_damage_detector_cuda_class_init (GrdRdpDamageDetectorCudaClass *klass)
{
  GrdRdpDamageDetectorClass *damage_detector_class =
    GRD_RDP_DAMAGE_DETECTOR_CLASS (klass);
  GObjectClass *gobject_class = G_OBJECT_CLASS (klass);

  /* GObject vfuncs */
  gobject_class->dispose = grd_rdp_damage_detector_cuda_dispose;

  /* GrdRdpDamageDetector vfuncs */
  damage_detector_class->resize_surface = resize_surface;
  damage_detector_class->invalidate_surface = invalidate_surface;
  damage_detector_class->submit_new_framebuffer = submit_new_framebuffer;
  damage_detector_class->is_region_damaged = is_region_damaged;
  damage_detector_class->get_damage_region = get_damage_region;
}