#include "boxm2_ocl_render_expected_image_function.h"
//
#include <vul/vul_timer.h>
#include <boxm2/ocl/boxm2_ocl_util.h>
#include <boxm2/ocl/algo/boxm2_ocl_camera_converter.h>
#include <vsph/vsph_camera_bounds.h>
#include <vgl/vgl_ray_3d.h>
#include <boct/boct_bit_tree.h>

#include <brad/brad_image_metadata.h>
#include <brad/brad_atmospheric_parameters.h>
#include <brad/brad_illum_util.h>

// Renders an expected image by running the supplied kernel once per visible block.
//
// Ray origins/directions for a cl_ni x cl_nj image are produced from cam by
// boxm2_ocl_camera_converter::compute_ray_image.  Then, for each block in the
// scene's visibility order, the block tree, its alpha data and the appearance
// data named by data_type are fetched from opencl_cache, bound to the kernel
// and the kernel is executed on queue with lthreads as the local size.
//
// Only "vpgl_perspective_camera" and "vpgl_generic_camera" are accepted; any
// other camera type prints a message and returns 0.
//
// apptypesize is the per-cell byte size of the appearance data.  Together with
// the alpha buffer size it determines how many bytes are requested from the
// cache for the appearance buffer.
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches.
float render_expected_image(  boxm2_scene_sptr & scene,
                              bocl_device_sptr & device,
                              boxm2_opencl_cache_sptr & opencl_cache,
                              cl_command_queue & queue,
                              vpgl_camera_double_sptr & cam,
                              bocl_mem_sptr & exp_image,
                              bocl_mem_sptr & vis_image,
                              bocl_mem_sptr & exp_img_dim,
                              vcl_string data_type,
                              bocl_kernel* kernel,
                              vcl_size_t * lthreads,
                              unsigned cl_ni,
                              unsigned cl_nj,
                              int apptypesize )
{
    // reject unsupported cameras before anything is allocated
    const vcl_string cam_type = cam->type_name();
    const bool cam_supported = (cam_type == "vpgl_perspective_camera" ||
                                cam_type == "vpgl_generic_camera");
    if (!cam_supported) {
      vcl_cout<<"Cannot render with camera of type "<<cam_type<<vcl_endl;
      return 0.0f;
    }

    float total_transfer = 0.0f;
    float total_gpu      = 0.0f;

    // host-side storage plus cache-tracked device buffers for the ray image
    cl_float* ray_origins    = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins, "ray_origins buffer");
    bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions, "ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);

    // kernel output scratch array, all 100 entries start at zero
    float output_arr[100] = {0.0f};
    bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // 256-entry bit lookup table
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    // one work item per pixel of the (cl_ni x cl_nj) image
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // walk the blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    for (vcl_size_t b = 0; b < vis_order.size(); ++b)
    {
        boxm2_block_id& cur_id = vis_order[b];
        vcl_cout<<cur_id;

        // metadata is looked up as in the original flow; it is not used further here
        boxm2_block_metadata blk_meta = scene->get_block_metadata(cur_id);

        // time the cache fetches for this block
        vul_timer fetch_timer;
        bocl_mem* blk       = opencl_cache->get_block(cur_id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(cur_id);
        int alphaTypeSize   = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
        // data_type may carry an identifier, so the appearance buffer size is
        // derived from the number of alpha cells times apptypesize
        bocl_mem* mog       = opencl_cache->get_data(cur_id,data_type,alpha->num_bytes()/alphaTypeSize*apptypesize,true);
        total_transfer += static_cast<float>(fetch_timer.all());

        // global-memory kernel arguments, appended in this order
        kernel->set_arg( blk_info );
        kernel->set_arg( blk );
        kernel->set_arg( alpha );
        kernel->set_arg( mog );
        kernel->set_arg( ray_o_buff.ptr() );
        kernel->set_arg( ray_d_buff.ptr() );
        kernel->set_arg( exp_image.ptr() );
        kernel->set_arg( exp_img_dim.ptr());
        kernel->set_arg( cl_output.ptr() );
        kernel->set_arg( lookup.ptr() );
        kernel->set_arg( vis_image.ptr() );

        // local-memory arguments: local tree, cumsum buffer, imindex buffer
        const vcl_size_t wg_items = lthreads[0]*lthreads[1];
        kernel->set_local_arg( wg_items*sizeof(cl_uchar16) );
        kernel->set_local_arg( wg_items*10*sizeof(cl_uchar) );
        kernel->set_local_arg( wg_items*sizeof(cl_int) );

        // launch and wait for completion before reading the timing
        kernel->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        total_gpu += kernel->exec_time();

        // arguments are re-set from scratch for the next block
        kernel->clear_args();
    }

    // release the host ray arrays, then drop the cache-tracked buffers
    delete[] ray_origins;
    delete[] ray_directions;
    opencl_cache->unref_mem(ray_o_buff.ptr());
    opencl_cache->unref_mem(ray_d_buff.ptr());

    vcl_cout<<"Gpu time "<<total_gpu<<" transfer time "<<total_transfer<<vcl_endl;
    return total_gpu + total_transfer;
}


// render_cone_expected_image - follows the same flow as render_expected_image,
// but the kernel additionally receives a ray level image and the bit-tree cell
// center lookup tables, and the third local-memory argument is larger
// (73 bytes per work item, used for the to-visit lists).
//
// The cone half angle of every pixel is computed on the CPU with
// vsph_camera_bounds::pixel_solid_angle and stored in the 4th component of the
// corresponding ray direction before the kernel runs.  That computation goes
// through the vpgl_perspective_camera interface, so only cameras whose
// type_name() is "vpgl_perspective_camera" are accepted; any other camera type
// prints a message and returns 0 without allocating anything.
//
// Opacity data is fetched from the cache as BOXM2_GAMMA and appearance data
// under the name given by data_type.
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches.
float render_cone_expected_image( boxm2_scene_sptr & scene,
                                  bocl_device_sptr & device,
                                  boxm2_opencl_cache_sptr & opencl_cache,
                                  cl_command_queue & queue,
                                  vpgl_camera_double_sptr & cam,
                                  bocl_mem_sptr & exp_image,
                                  bocl_mem_sptr & vis_image,
                                  bocl_mem_sptr & ray_level_image,
                                  bocl_mem_sptr & exp_img_dim,
                                  vcl_string data_type,
                                  bocl_kernel* kernel,
                                  vcl_size_t * lthreads,
                                  unsigned cl_ni,
                                  unsigned cl_nj )
{
    float transfer_time=0.0f;
    float gpu_time=0.0f;

    //camera check: the half-angle computation below downcasts the camera to a
    //perspective camera, so a generic camera must be rejected here rather than
    //being reinterpreted as a type it is not.
    if (cam->type_name() != "vpgl_perspective_camera") {
      vcl_cout<<"Cannot render with camera of type "<<cam->type_name()<<vcl_endl;
      return 0.0f;
    }
    vpgl_perspective_camera<double>* pcam = static_cast<vpgl_perspective_camera<double>*>(cam.ptr());

    //set up the ray image (host arrays wrapped by bocl_mem objects)
    cl_float* ray_origins = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = new bocl_mem(device->context(), ray_origins, cl_ni*cl_nj * sizeof(cl_float4) , "ray_origins buffer");
    bocl_mem_sptr ray_d_buff = new bocl_mem(device->context(), ray_directions,  cl_ni*cl_nj * sizeof(cl_float4), "ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);
    //pull the directions back to the host so the 4th component can be patched
    ray_d_buff->read_to_buffer(queue);

    ////////////////////////////////////////////////////////////////////////////////
    //cone half angles are computed on the CPU (kept this way for debugging)
    vcl_cout<<"  DEBUG: COMPUTING CONE HALF ANGLES ON CPU"<<vcl_endl;
    vcl_size_t pix = 0;
    for (unsigned j=0;j<cl_nj;++j) {
      for (unsigned i=0;i<cl_ni;++i) {
        //calculate ray and ray angles at pixel ij
        vgl_ray_3d<double> ray_ij;
        double cone_half_angle, solid_angle;
        vsph_camera_bounds::pixel_solid_angle(*pcam, i, j, ray_ij, cone_half_angle, solid_angle);
        //only the half angle is kept; it goes in the 4th float of the direction
        ray_directions[4*pix+3] = static_cast<cl_float>(cone_half_angle);
        ++pix;
      }
    }
    //push the patched directions back to the device
    ray_d_buff->write_to_buffer(queue);
    vcl_cout<<"opencl Half angle: "
            <<ray_directions[0]<<','
            <<ray_directions[1]<<','
            <<ray_directions[2]<<','
            <<ray_directions[3]<<'\n'
            <<"  DEBUG: FINISHED CONE HALF ANGLES ON CPU"<<vcl_endl;
    ////////////////////////////////////////////////////////////////////////////////


    // Output Array
    float output_arr[100];
    for (int i=0; i<100; ++i) output_arr[i] = 0.0f;
    bocl_mem_sptr  cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // bit lookup buffer
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    //center buffers: 585 floats each, taken from boct_bit_tree's static tables
    bocl_mem_sptr centerX = new bocl_mem(device->context(), boct_bit_tree::centerX, sizeof(cl_float)*585, "centersX lookup buffer");
    bocl_mem_sptr centerY = new bocl_mem(device->context(), boct_bit_tree::centerY, sizeof(cl_float)*585, "centersY lookup buffer");
    bocl_mem_sptr centerZ = new bocl_mem(device->context(), boct_bit_tree::centerZ, sizeof(cl_float)*585, "centersZ lookup buffer");
    centerX->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
    centerY->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
    centerZ->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    //2. set global thread size (one work item per pixel)
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // process blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    vcl_vector<boxm2_block_id>::iterator id;
    for (id = vis_order.begin(); id != vis_order.end(); ++id)
    {
        bocl_kernel* kern =  kernel;

        //fetch block data from the cache, timing the fetches
        vul_timer transfer;
        bocl_mem* blk       = opencl_cache->get_block(*id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_GAMMA>(*id);
        bocl_mem* mog       = opencl_cache->get_data(*id,data_type);
        transfer_time += (float) transfer.all();

        ////3. SET args (appended in this order)
        kern->set_arg( blk_info );
        kern->set_arg( blk );
        kern->set_arg( alpha );
        kern->set_arg( mog );
        kern->set_arg( ray_o_buff.ptr() );
        kern->set_arg( ray_d_buff.ptr() );
        kern->set_arg( exp_image.ptr() );
        kern->set_arg( exp_img_dim.ptr());
        kern->set_arg( cl_output.ptr() );
        kern->set_arg( lookup.ptr() );
        kern->set_arg( vis_image.ptr() );
        kern->set_arg( ray_level_image.ptr() );

        //set centers args
        kern->set_arg( centerX.ptr() );
        kern->set_arg( centerY.ptr() );
        kern->set_arg( centerZ.ptr() );

        //local tree , cumsum buffer, tree list
        kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_uchar16) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*10*sizeof(cl_uchar) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*73*sizeof(cl_uchar) ); //to visit lists

        //execute kernel and wait for it before reading the timing
        kern->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        gpu_time += kern->exec_time();

        //clear render kernel args so they can be set again on the next execution
        kern->clear_args();
    }

    //clean up the host ray arrays
    delete[] ray_origins;
    delete[] ray_directions;

    vcl_cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<vcl_endl;
    return gpu_time + transfer_time;
}

// Renders an expected shadow map by running the supplied kernel once per
// visible block.
//
// The flow matches the other expected-image renderers: a cl_ni x cl_nj ray
// image is generated from cam, and for every block in the scene's visibility
// order the block tree, alpha data and a BOXM2_AUX0 buffer are bound to the
// kernel.  The AUX0 buffer name is built with
// boxm2_data_traits<BOXM2_AUX0>::prefix(data_type), i.e. data_type is used as
// the identifier of the aux data rather than as an appearance type.
//
// Only "vpgl_perspective_camera" and "vpgl_generic_camera" are accepted; any
// other camera type prints a message and returns 0.
//
// The host buffer addresses of the fetched block, alpha and aux data are
// printed for each block (debug output).
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches (which include the debug prints).
float render_expected_shadow_map(boxm2_scene_sptr & scene,
                                 bocl_device_sptr & device,
                                 boxm2_opencl_cache_sptr & opencl_cache,
                                 cl_command_queue & queue,
                                 vpgl_camera_double_sptr & cam,
                                 bocl_mem_sptr & exp_image,
                                 bocl_mem_sptr & vis_image,
                                 bocl_mem_sptr & exp_img_dim,
                                 vcl_string data_type,
                                 bocl_kernel* kernel,
                                 vcl_size_t * lthreads,
                                 unsigned cl_ni,
                                 unsigned cl_nj )
{
    // reject unsupported cameras before anything is allocated
    const vcl_string cam_type = cam->type_name();
    const bool cam_supported = (cam_type == "vpgl_perspective_camera" ||
                                cam_type == "vpgl_generic_camera");
    if (!cam_supported) {
      vcl_cout<<"Cannot render with camera of type "<<cam_type<<vcl_endl;
      return 0.0f;
    }

    float total_transfer = 0.0f;
    float total_gpu      = 0.0f;

    // host-side storage plus cache-tracked device buffers for the ray image
    cl_float* ray_origins    = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_origins,"ray_origins buffer");
    bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj*sizeof(cl_float4), ray_directions,"ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);

    // kernel output scratch array, all 100 entries start at zero
    float output_arr[100] = {0.0f};
    bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // 256-entry bit lookup table
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    // one work item per pixel of the (cl_ni x cl_nj) image
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // walk the blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    for (vcl_size_t b = 0; b < vis_order.size(); ++b)
    {
        boxm2_block_id& cur_id = vis_order[b];

        // metadata is looked up as in the original flow; it is not used further here
        boxm2_block_metadata blk_meta = scene->get_block_metadata(cur_id);

        // time the cache fetches (and the debug prints) for this block
        vul_timer fetch_timer;
        bocl_mem* blk       = opencl_cache->get_block(cur_id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(cur_id);
        bocl_mem* aux_sun   = opencl_cache->get_data(cur_id, boxm2_data_traits<BOXM2_AUX0>::prefix(data_type));

        // debug: host buffer addresses of the fetched data
        vcl_cout << "blk = " << blk->cpu_buffer() << vcl_endl;
        vcl_cout << "alpha = " << alpha->cpu_buffer() << vcl_endl;
        vcl_cout << "aux_sun = " << aux_sun->cpu_buffer() << vcl_endl;

        total_transfer += static_cast<float>(fetch_timer.all());

        // global-memory kernel arguments, appended in this order
        kernel->set_arg( blk_info );
        kernel->set_arg( blk );
        kernel->set_arg( alpha );
        kernel->set_arg( aux_sun );
        kernel->set_arg( ray_o_buff.ptr() );
        kernel->set_arg( ray_d_buff.ptr() );
        kernel->set_arg( exp_image.ptr() );
        kernel->set_arg( exp_img_dim.ptr());
        kernel->set_arg( cl_output.ptr() );
        kernel->set_arg( lookup.ptr() );
        kernel->set_arg( vis_image.ptr() );

        // local-memory arguments: local tree, cumsum buffer, imindex buffer
        const vcl_size_t wg_items = lthreads[0]*lthreads[1];
        kernel->set_local_arg( wg_items*sizeof(cl_uchar16) );
        kernel->set_local_arg( wg_items*10*sizeof(cl_uchar) );
        kernel->set_local_arg( wg_items*sizeof(cl_int) );

        // launch and wait for completion before reading the timing
        kernel->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        total_gpu += kernel->exec_time();

        // arguments are re-set from scratch for the next block
        kernel->clear_args();
    }

    // drop the cache-tracked buffers, then release the host ray arrays
    opencl_cache->unref_mem(ray_d_buff.ptr());
    opencl_cache->unref_mem(ray_o_buff.ptr());

    delete[] ray_origins;
    delete[] ray_directions;

    vcl_cout<<"Gpu time "<<total_gpu<<" transfer time "<<total_transfer<<vcl_endl;
    return total_gpu + total_transfer;
}

// Renders an expected image from a Phong appearance model by running the
// supplied kernel once per visible block.
//
// A cl_ni x cl_nj ray image is generated from cam; for every block in the
// scene's visibility order the block tree, alpha data and the appearance
// buffer are bound to the kernel together with the sundir buffer, which is
// passed right after the ray buffers.
//
// Note: the data_type parameter is not read by this function; the appearance
// buffer is always requested from the cache under the fixed name
// "float8_phongs_model".
//
// Only "vpgl_perspective_camera" and "vpgl_generic_camera" are accepted; any
// other camera type prints a message and returns 0.
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches.
float render_expected_phongs_image( boxm2_scene_sptr & scene,
                                    bocl_device_sptr & device,
                                    boxm2_opencl_cache_sptr & opencl_cache,
                                    cl_command_queue & queue,
                                    vpgl_camera_double_sptr & cam,
                                    bocl_mem_sptr & exp_image,
                                    bocl_mem_sptr & vis_image,
                                    bocl_mem_sptr & exp_img_dim,
                                    vcl_string data_type,
                                    bocl_kernel* kernel,
                                    vcl_size_t * lthreads,
                                    unsigned cl_ni,
                                    unsigned cl_nj,
                                    bocl_mem_sptr sundir)
{
    // reject unsupported cameras before anything is allocated
    const vcl_string cam_type = cam->type_name();
    const bool cam_supported = (cam_type == "vpgl_perspective_camera" ||
                                cam_type == "vpgl_generic_camera");
    if (!cam_supported) {
      vcl_cout<<"Cannot render with camera of type "<<cam_type<<vcl_endl;
      return 0.0f;
    }

    float total_transfer = 0.0f;
    float total_gpu      = 0.0f;

    // host-side ray arrays wrapped by bocl_mem objects
    cl_float* ray_origins    = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = new bocl_mem(device->context(), ray_origins   ,  cl_ni*cl_nj * sizeof(cl_float4), "ray_origins buffer");
    bocl_mem_sptr ray_d_buff = new bocl_mem(device->context(), ray_directions,  cl_ni*cl_nj * sizeof(cl_float4), "ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);

    // kernel output scratch array, all 100 entries start at zero
    float output_arr[100] = {0.0f};
    bocl_mem_sptr cl_output = new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // 256-entry bit lookup table
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup = new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    // one work item per pixel of the (cl_ni x cl_nj) image
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // walk the blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    for (vcl_size_t b = 0; b < vis_order.size(); ++b)
    {
        boxm2_block_id& cur_id = vis_order[b];
        vcl_cout<<cur_id;

        // metadata is looked up as in the original flow; it is not used further here
        boxm2_block_metadata blk_meta = scene->get_block_metadata(cur_id);

        // time the cache fetches for this block
        vul_timer fetch_timer;
        bocl_mem* blk       = opencl_cache->get_block(cur_id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(cur_id);
        bocl_mem* mog       = opencl_cache->get_data(cur_id,"float8_phongs_model");
        total_transfer += static_cast<float>(fetch_timer.all());

        // global-memory kernel arguments, appended in this order
        kernel->set_arg( blk_info );
        kernel->set_arg( blk );
        kernel->set_arg( alpha );
        kernel->set_arg( mog );
        kernel->set_arg( ray_o_buff.ptr() );
        kernel->set_arg( ray_d_buff.ptr() );
        kernel->set_arg( sundir.ptr() );
        kernel->set_arg( exp_image.ptr() );
        kernel->set_arg( exp_img_dim.ptr());
        kernel->set_arg( cl_output.ptr() );
        kernel->set_arg( lookup.ptr() );
        kernel->set_arg( vis_image.ptr() );

        // local-memory arguments: local tree, cumsum buffer, imindex buffer
        const vcl_size_t wg_items = lthreads[0]*lthreads[1];
        kernel->set_local_arg( wg_items*sizeof(cl_uchar16) );
        kernel->set_local_arg( wg_items*10*sizeof(cl_uchar) );
        kernel->set_local_arg( wg_items*sizeof(cl_int) );

        // launch and wait for completion before reading the timing
        kernel->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        total_gpu += kernel->exec_time();

        // arguments are re-set from scratch for the next block
        kernel->clear_args();
    }

    // release the host ray arrays
    delete[] ray_origins;
    delete[] ray_directions;

    vcl_cout<<"Gpu time "<<total_gpu<<" transfer time "<<total_transfer<<vcl_endl;
    return total_gpu + total_transfer;
}


// Renders an expected image from the normal/albedo array (NAA) appearance
// model by running the supplied kernel once per visible block.
//
// For each of the 16 normal directions returned by
// boxm2_normal_albedo_array::get_normals() a linear radiance model is built
// with brad_expected_radiance_chavez using metadata and atm_params:
//   offset = radiance at reflectance 0
//   scale  = radiance at reflectance 1 minus offset
// The scales and offsets are uploaded as read-only buffers and passed to the
// kernel right after the per-block NAA appearance data.
//
// Only "vpgl_perspective_camera" and "vpgl_generic_camera" are accepted, and
// exactly 16 normals are required.  Both conditions are checked before any
// buffer is allocated; on failure a message is printed and 0 is returned.
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches.
float render_expected_image_naa(  boxm2_scene_sptr & scene,
                                  bocl_device_sptr & device,
                                  boxm2_opencl_cache_sptr & opencl_cache,
                                  cl_command_queue & queue,
                                  vpgl_camera_double_sptr & cam,
                                  bocl_mem_sptr & exp_image,
                                  bocl_mem_sptr & vis_image,
                                  bocl_mem_sptr & exp_img_dim,
                                  bocl_kernel* kernel,
                                  vcl_size_t * lthreads,
                                  unsigned cl_ni,
                                  unsigned cl_nj,
                                  const brad_image_metadata_sptr  metadata,
                                  const brad_atmospheric_parameters_sptr atm_params)
{
    float transfer_time=0.0f;
    float gpu_time=0.0f;

    //camera check
    if (cam->type_name()!= "vpgl_perspective_camera" &&
        cam->type_name()!= "vpgl_generic_camera" ) {
      vcl_cout<<"Cannot render with camera of type "<<cam->type_name()<<vcl_endl;
      return 0.0f;
    }

    // get normal directions
    vcl_vector<vgl_vector_3d<double> > normals = boxm2_normal_albedo_array::get_normals();
    const unsigned int num_normals = static_cast<unsigned int>(normals.size());
    // opencl code depends on there being exactly 16 normal directions - do the
    // sanity check before allocating anything so the early return cannot leak
    if (num_normals != 16) {
      vcl_cerr << "ERROR: render_expected_image_naa: num_normals = " << num_normals << ".  Expected 16" << vcl_endl;
      return 0.0f;
    }

    //set generic cam and get visible block order
    cl_float* ray_origins    = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj * sizeof(cl_float4),ray_origins   , "ray_origins buffer");
    bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj * sizeof(cl_float4),ray_directions, "ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);

    // host storage for radiance scales and offsets per normal; declared before
    // the bocl_mem objects that reference it so it outlives them
    vcl_vector<float> radiance_scales_buff(num_normals);
    vcl_vector<float> radiance_offsets_buff(num_normals);

    // compute offsets and scale for linear radiance model
    for (unsigned n=0; n < num_normals; ++n) {
      // compute offset as radiance of surface with 0 reflectance
      const double offset = brad_expected_radiance_chavez(0.0, normals[n], *metadata, *atm_params);
      radiance_offsets_buff[n] = static_cast<float>(offset);
      // use perfect reflector to compute radiance scale
      const double radiance = brad_expected_radiance_chavez(1.0, normals[n], *metadata, *atm_params);
      radiance_scales_buff[n] = static_cast<float>(radiance - offset);
      vcl_cout << "radiance_scales["<<n<<"] = " << radiance_scales_buff[n] << vcl_endl;
    }

    bocl_mem_sptr radiance_scales = new bocl_mem(device->context(), &radiance_scales_buff[0], sizeof(float)*num_normals,"radiance scales buffer");
    radiance_scales->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    bocl_mem_sptr radiance_offsets = new bocl_mem(device->context(), &radiance_offsets_buff[0], sizeof(float)*num_normals,"radiance offset buffer");
    radiance_offsets->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    // Output Array
    float output_arr[100];
    for (int i=0; i<100; ++i) output_arr[i] = 0.0f;
    bocl_mem_sptr  cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // bit lookup buffer
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    //2. set global thread size (one work item per pixel)
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // process blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    vcl_vector<boxm2_block_id>::iterator id;
    for (id = vis_order.begin(); id != vis_order.end(); ++id)
    {
        vcl_cout << (*id) << vcl_endl;
        bocl_kernel* kern =  kernel;

        //fetch block data from the cache, timing the fetches
        vul_timer transfer;
        bocl_mem* blk       = opencl_cache->get_block(*id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(*id);
        int alphaTypeSize   = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
        // the appearance buffer size is derived from the number of alpha cells
        unsigned int num_cells = alpha->num_bytes()/alphaTypeSize;
        vcl_string data_type = boxm2_data_traits<BOXM2_NORMAL_ALBEDO_ARRAY>::prefix();
        int appTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_NORMAL_ALBEDO_ARRAY>::prefix());

        bocl_mem* naa_apm  = opencl_cache->get_data(*id,data_type,num_cells*appTypeSize, true);
        transfer_time += (float) transfer.all();

        ////3. SET args (appended in this order)
        kern->set_arg( blk_info );
        kern->set_arg( blk );
        kern->set_arg( alpha );
        kern->set_arg( naa_apm );
        kern->set_arg( radiance_scales.ptr() );
        kern->set_arg( radiance_offsets.ptr() );
        kern->set_arg( ray_o_buff.ptr() );
        kern->set_arg( ray_d_buff.ptr() );
        kern->set_arg( exp_image.ptr() );
        kern->set_arg( exp_img_dim.ptr());
        kern->set_arg( cl_output.ptr() );
        kern->set_arg( lookup.ptr() );
        kern->set_arg( vis_image.ptr() );

        //local tree , cumsum buffer, imindex buffer
        kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_uchar16) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*10*sizeof(cl_uchar) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_int) );

        //execute kernel and wait for it before reading the timing
        vcl_cout << "executing kernel.." << vcl_endl;
        kern->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        float kern_time = kern->exec_time();
        vcl_cout << "..exec_time = " << kern_time << vcl_endl;
        gpu_time += kern_time;

        //clear render kernel args so they can be set again on the next execution
        kern->clear_args();
    }

    //clean up the ray buffers
    opencl_cache->unref_mem(ray_o_buff.ptr());
    opencl_cache->unref_mem(ray_d_buff.ptr());

    delete[] ray_origins;
    delete[] ray_directions;

    vcl_cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<vcl_endl;
    vcl_cout << "Returning " << vcl_endl;
    return gpu_time + transfer_time;
}

// Renders an expected albedo/normal image from the normal/albedo array (NAA)
// appearance model by running the supplied kernel once per visible block.
//
// The x, y and z components of the 16 normal directions returned by
// boxm2_normal_albedo_array::get_normals() are packed into three cl_float16
// values, uploaded as read-only buffers and passed to the kernel right after
// the per-block NAA appearance data.
//
// Only "vpgl_perspective_camera" and "vpgl_generic_camera" are accepted, and
// exactly 16 normals are required (the normals are packed into cl_float16).
// Both conditions are checked before any buffer is allocated; on failure a
// message is printed and 0 is returned.
//
// Returns the accumulated kernel exec_time() plus the accumulated vul_timer
// readings taken around the cache fetches.
float render_expected_albedo_normal( boxm2_scene_sptr & scene,
                                     bocl_device_sptr & device,
                                     boxm2_opencl_cache_sptr & opencl_cache,
                                     cl_command_queue & queue,
                                     vpgl_camera_double_sptr & cam,
                                     bocl_mem_sptr & exp_image,
                                     bocl_mem_sptr & vis_image,
                                     bocl_mem_sptr & exp_img_dim,
                                     bocl_kernel* kernel,
                                     vcl_size_t * lthreads,
                                     unsigned cl_ni,
                                     unsigned cl_nj)
{
    float transfer_time=0.0f;
    float gpu_time=0.0f;

    //camera check
    if (cam->type_name()!= "vpgl_perspective_camera" &&
        cam->type_name()!= "vpgl_generic_camera" ) {
      vcl_cout<<"Cannot render with camera of type "<<cam->type_name()<<vcl_endl;
      return 0.0f;
    }

    // get normals; the packing below reads exactly 16 entries, so verify the
    // count first (same sanity check as render_expected_image_naa) instead of
    // indexing past the end of the vector
    vcl_vector<vgl_vector_3d<double> > normals = boxm2_normal_albedo_array::get_normals();
    const unsigned int num_normals = static_cast<unsigned int>(normals.size());
    if (num_normals != 16) {
      vcl_cerr << "ERROR: render_expected_albedo_normal: num_normals = " << num_normals << ".  Expected 16" << vcl_endl;
      return 0.0f;
    }

    //set generic cam and get visible block order
    cl_float* ray_origins    = new cl_float[4*cl_ni*cl_nj];
    cl_float* ray_directions = new cl_float[4*cl_ni*cl_nj];
    bocl_mem_sptr ray_o_buff = opencl_cache->alloc_mem(cl_ni*cl_nj * sizeof(cl_float4),ray_origins   , "ray_origins buffer");
    bocl_mem_sptr ray_d_buff = opencl_cache->alloc_mem(cl_ni*cl_nj * sizeof(cl_float4),ray_directions, "ray_directions buffer");
    boxm2_ocl_camera_converter::compute_ray_image( device, queue, cam, cl_ni, cl_nj, ray_o_buff, ray_d_buff);

    // pack the normal components into one cl_float16 per axis
    cl_float16 normals_x, normals_y, normals_z;
    for (unsigned int i=0; i<16; ++i) {
       normals_x.s[i] = static_cast<cl_float>(normals[i].x());
       normals_y.s[i] = static_cast<cl_float>(normals[i].y());
       normals_z.s[i] = static_cast<cl_float>(normals[i].z());
    }
    bocl_mem_sptr normals_x_buff = new bocl_mem(device->context(), &normals_x, sizeof(cl_float16), "normals_x buffer");
    normals_x_buff->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    bocl_mem_sptr normals_y_buff = new bocl_mem(device->context(), &normals_y, sizeof(cl_float16), "normals_y buffer");
    normals_y_buff->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    bocl_mem_sptr normals_z_buff = new bocl_mem(device->context(), &normals_z, sizeof(cl_float16), "normals_z buffer");
    normals_z_buff->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    // Output Array
    float output_arr[100];
    for (int i=0; i<100; ++i) output_arr[i] = 0.0f;
    bocl_mem_sptr  cl_output=new bocl_mem(device->context(), output_arr, sizeof(float)*100, "output buffer");
    cl_output->create_buffer(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);

    // bit lookup buffer
    cl_uchar lookup_arr[256];
    boxm2_ocl_util::set_bit_lookup(lookup_arr);
    bocl_mem_sptr lookup=new bocl_mem(device->context(), lookup_arr, sizeof(cl_uchar)*256, "bit lookup buffer");
    lookup->create_buffer(CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);

    //2. set global thread size (one work item per pixel)
    vcl_size_t gThreads[] = {cl_ni,cl_nj};

    // process blocks in the visibility order reported by the scene
    vcl_vector<boxm2_block_id> vis_order = scene->get_vis_blocks(cam);
    vcl_vector<boxm2_block_id>::iterator id;
    for (id = vis_order.begin(); id != vis_order.end(); ++id)
    {
        vcl_cout << (*id) << vcl_endl;

        bocl_kernel* kern =  kernel;

        //fetch block data from the cache, timing the fetches
        vul_timer transfer;
        bocl_mem* blk       = opencl_cache->get_block(*id);
        bocl_mem* blk_info  = opencl_cache->loaded_block_info();
        bocl_mem* alpha     = opencl_cache->get_data<BOXM2_ALPHA>(*id);
        int alphaTypeSize   = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_ALPHA>::prefix());
        // the appearance buffer size is derived from the number of alpha cells
        unsigned int num_cells = alpha->num_bytes()/alphaTypeSize;
        vcl_string data_type = boxm2_data_traits<BOXM2_NORMAL_ALBEDO_ARRAY>::prefix();
        int appTypeSize = (int)boxm2_data_info::datasize(boxm2_data_traits<BOXM2_NORMAL_ALBEDO_ARRAY>::prefix());

        bocl_mem* naa_apm  = opencl_cache->get_data(*id,data_type,num_cells*appTypeSize, true);
        transfer_time += (float) transfer.all();

        ////3. SET args (appended in this order)
        kern->set_arg( blk_info );
        kern->set_arg( blk );
        kern->set_arg( alpha );
        kern->set_arg( naa_apm );
        kern->set_arg( normals_x_buff.ptr() );
        kern->set_arg( normals_y_buff.ptr() );
        kern->set_arg( normals_z_buff.ptr() );
        kern->set_arg( ray_o_buff.ptr() );
        kern->set_arg( ray_d_buff.ptr() );
        kern->set_arg( exp_image.ptr() );
        kern->set_arg( exp_img_dim.ptr());
        kern->set_arg( cl_output.ptr() );
        kern->set_arg( lookup.ptr() );
        kern->set_arg( vis_image.ptr() );

        //local tree , cumsum buffer, imindex buffer
        kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_uchar16) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*10*sizeof(cl_uchar) );
        kern->set_local_arg( lthreads[0]*lthreads[1]*sizeof(cl_int) );

        //execute kernel and wait for it before reading the timing
        vcl_cout << "executing kernel.." << vcl_endl;
        kern->execute(queue, 2, lthreads, gThreads);
        clFinish(queue);
        float kern_time = kern->exec_time();
        vcl_cout << "..exec_time = " << kern_time << vcl_endl;
        gpu_time += kern_time;

        //clear render kernel args so they can be set again on the next execution
        kern->clear_args();
    }

    //clean up the ray buffers
    opencl_cache->unref_mem(ray_o_buff.ptr());
    opencl_cache->unref_mem(ray_d_buff.ptr());

    delete[] ray_origins;
    delete[] ray_directions;

    vcl_cout<<"Gpu time "<<gpu_time<<" transfer time "<<transfer_time<<vcl_endl;
    return gpu_time + transfer_time;
}

