/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include "data.h"

#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>   /* free() */

#include <sys/stat.h>

#include <curl/curl.h>

#include "mem.h"
#include "util.h"
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"

/**
* Data objects
*/

/* Builds the download URL of a regression-data release asset.
* `x` must be a string literal: it is appended to the prefix by adjacent
* string-literal concatenation. */
#define REGRESSION_RELEASE(x) \
   "https://github.com/facebook/zstd/releases/download/regression-data/" x

/* Directory-type data set: the downloaded archive is piped through
* `zstd -dc | tar -x` into the cache directory (see curl_data_create()).
* `xxhash64` is the expected hash of the downloaded bytes. */
data_t silesia = {
   .name = "silesia",
   .type = data_type_dir,
   .data =
       {
           .url = REGRESSION_RELEASE("silesia.tar.zst"),
           .xxhash64 = 0x48a199f92f93e977LL,
       },
};

/* File-type data set: same download and hash as `silesia`, but the archive
* is only decompressed (`zstd -d`) into a single file at the data path. */
data_t silesia_tar = {
   .name = "silesia.tar",
   .type = data_type_file,
   .data =
       {
           .url = REGRESSION_RELEASE("silesia.tar.zst"),
           .xxhash64 = 0x48a199f92f93e977LL,
       },
};

/* Directory-type data set that also ships a dictionary.
* A non-NULL `dict.url` is what data_has_dict() tests for; the dictionary is
* always downloaded as a single file (see curl_download_datum()). */
data_t github = {
   .name = "github",
   .type = data_type_dir,
   .data =
       {
           .url = REGRESSION_RELEASE("github.tar.zst"),
           .xxhash64 = 0xa9b1b44b020df292LL,
       },
   .dict =
       {
           .url = REGRESSION_RELEASE("github.dict.zst"),
           .xxhash64 = 0x1eddc6f737d3cb53LL,

       },
};

/* File-type variant of `github`: same archive, dictionary and hashes, but the
* archive is kept as one decompressed file instead of being extracted. */
data_t github_tar = {
   .name = "github.tar",
   .type = data_type_file,
   .data =
       {
           .url = REGRESSION_RELEASE("github.tar.zst"),
           .xxhash64 = 0xa9b1b44b020df292LL,
       },
   .dict =
       {
           .url = REGRESSION_RELEASE("github.dict.zst"),
           .xxhash64 = 0x1eddc6f737d3cb53LL,

       },
};

/* Mutable, NULL-terminated registry of every data object.
* The helpers in this file walk it until the NULL sentinel, and
* data_create_paths()/data_free_paths() fill in and release the path fields. */
static data_t* g_data[] = {
   &silesia,
   &silesia_tar,
   &github,
   &github_tar,
   NULL,
};

/* Read-only view of the g_data registry (same NULL-terminated array). */
data_t const* const* data = (data_t const* const*)g_data;

/**
* data helpers.
*/

/** Returns 1 when the data object has a dictionary URL, 0 otherwise. */
int data_has_dict(data_t const* data) {
   if (data->dict.url == NULL)
       return 0;
   return 1;
}

/**
* data buffer helper functions (documented in header).
*/

/**
* Allocates a buffer with room for `capacity` bytes.
* If the allocation fails the returned buffer keeps its empty
* initialization, i.e. `data` is NULL and `capacity` is not set.
*/
data_buffer_t data_buffer_create(size_t const capacity) {
   data_buffer_t result = {};
   uint8_t* const storage = (uint8_t*)malloc(capacity);

   if (storage != NULL) {
       result.data = storage;
       result.capacity = capacity;
   }
   return result;
}

/**
* Reads the whole file `filename` into a freshly allocated buffer.
* On any failure a message is printed to stderr and an empty buffer
* (data == NULL) is returned.
*/
data_buffer_t data_buffer_read(char const* filename) {
   data_buffer_t result = {};
   FILE* input;
   uint64_t const fileSize = UTIL_getFileSize(filename);

   if (fileSize == UTIL_FILESIZE_UNKNOWN) {
       fprintf(stderr, "unknown size for %s\n", filename);
       return result;
   }

   result.data = (uint8_t*)malloc(fileSize);
   if (result.data == NULL) {
       fprintf(stderr, "malloc failed\n");
       return result;
   }
   result.capacity = fileSize;

   input = fopen(filename, "rb");
   if (input != NULL) {
       result.size = fread(result.data, 1, result.capacity, input);
       fclose(input);
       /* Success only when the file delivered exactly the expected size. */
       if (result.size == result.capacity)
           return result;
       fprintf(stderr, "read %zu != %zu\n", result.size, result.capacity);
   } else {
       fprintf(stderr, "file null\n");
   }

   /* Failure after allocation: release the storage and hand back zeros. */
   free(result.data);
   memset(&result, 0, sizeof(result));
   return result;
}

/**
* Loads the contents of a file-type data object from its data path.
* Any other data type yields an empty buffer.
*/
data_buffer_t data_buffer_get_data(data_t const* data) {
   if (data->type == data_type_file)
       return data_buffer_read(data->data.path);

   data_buffer_t const empty = {};
   return empty;
}

/**
* Loads the dictionary of a data object from its dictionary path.
* Data objects without a dictionary yield an empty buffer.
*/
data_buffer_t data_buffer_get_dict(data_t const* data) {
   if (data_has_dict(data))
       return data_buffer_read(data->dict.path);

   data_buffer_t const empty = {};
   return empty;
}

/**
* Three-way comparison of two buffers.
* The common prefix is compared bytewise; if it is equal the shorter buffer
* orders first.
*
* @returns <0, 0 or >0 when buffer1 is less than, equal to or greater than
*          buffer2.
*/
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
   size_t const common =
       buffer1.size < buffer2.size ? buffer1.size : buffer2.size;

   /* An empty buffer may carry a NULL data pointer, and passing NULL to
    * memcmp() is undefined even for a zero length, so skip the call. */
   if (common != 0) {
       int const cmp = memcmp(buffer1.data, buffer2.data, common);
       if (cmp != 0)
           return cmp;
   }
   if (buffer1.size < buffer2.size)
       return -1;
   if (buffer1.size > buffer2.size)
       return 1;
   return 0;
}

/** Releases the storage owned by `buffer`. Safe on an empty buffer. */
void data_buffer_free(data_buffer_t buffer) {
   void* const storage = buffer.data;
   free(storage);
}

/**
* data filenames helpers.
*/

/**
* Expands the data path of `data` into a table of file names.
* Symbolic links are not followed.
*/
FileNamesTable* data_filenames_get(data_t const* data)
{
   char const* const roots[1] = { data->data.path };
   return UTIL_createExpandedFNT(roots, 1, 0 /* followLinks */ );
}

/**
* data buffers helpers.
*/

/**
* Reads every file found under the data path of `data`.
*
* @returns One buffer per file, or an empty set (size == 0) when the path
*          expands to no files, on allocation failure, or when any single
*          file cannot be read.
*/
data_buffers_t data_buffers_get(data_t const* data) {
   data_buffers_t buffers = {.size = 0};
   FileNamesTable* const filenames = data_filenames_get(data);
   if (filenames == NULL) return buffers;
   if (filenames->tableSize == 0) {
       UTIL_freeFileNamesTable(filenames);
       return buffers;
   }

   data_buffer_t* buffersPtr =
       (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
   if (buffersPtr == NULL) {
       UTIL_freeFileNamesTable(filenames);
       return buffers;
   }
   buffers.buffers = (data_buffer_t const*)buffersPtr;
   buffers.size = filenames->tableSize;

   for (size_t i = 0; i < filenames->tableSize; ++i) {
       buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
       if (buffersPtr[i].data == NULL) {
           data_buffers_t const kEmptyBuffer = {};
           /* Entries [0, i) were read successfully and own their storage;
            * release them here because data_buffers_free() only releases
            * the array itself. Entries past i were never initialized. */
           for (size_t j = 0; j < i; ++j)
               data_buffer_free(buffersPtr[j]);
           data_buffers_free(buffers);
           UTIL_freeFileNamesTable(filenames);
           return kEmptyBuffer;
       }
   }

   UTIL_freeFileNamesTable(filenames);
   return buffers;
}

/**
* Frees the data buffers.
* Only the array of buffer descriptors is released here.
* NOTE(review): the per-file storage of each entry is not freed by this
* function - confirm whether callers are expected to release it.
*/
void data_buffers_free(data_buffers_t buffers) {
   data_buffer_t* const array = (data_buffer_t*)buffers.buffers;
   free(array);
}

/**
* Initialization and download functions.
*/

/* Heap copy of the cache directory path; set by data_init() and released by
* data_finish(). NULL while the module is not initialized. */
static char* g_data_dir = NULL;

/**
* mkdir -p: creates `indir` and every missing parent directory.
*
* @returns 0 on success, EINVAL for a NULL or empty path, ENOMEM when the
*          path cannot be duplicated, otherwise the errno left by mkdir().
*/
static int ensure_directory_exists(char const* indir) {
   char* dir;
   char* end;
   int ret = 0;

   /* The scan below starts at index 1, so an empty path would walk past the
    * terminator. Reject it up front. */
   if (indir == NULL || indir[0] == '\0')
       return EINVAL;

   dir = strdup(indir);
   if (dir == NULL)
       return ENOMEM;

   end = dir;
   do {
       /* Find the next directory level. */
       for (++end; *end != '\0' && *end != '/'; ++end)
           ;
       /* End the string there, make the directory, and restore the string. */
       char const save = *end;
       *end = '\0';
       /* Must be sampled before mkdir() so EEXIST can be told apart from
        * "exists but is not a directory". */
       int const isdir = UTIL_isDirectory(dir);
       ret = mkdir(dir, S_IRWXU);
       int const mkdirErrno = errno;
       *end = save;
       /* It's okay if the directory already exists. */
       if (ret == 0 || (mkdirErrno == EEXIST && isdir))
           continue;
       ret = mkdirErrno;
       fprintf(stderr, "mkdir() failed\n");
       goto out;
   } while (*end != '\0');

   ret = 0;
out:
   free(dir);
   return ret;
}

/**
* Joins up to three strings into one newly allocated string.
* `str3` may be NULL, in which case only the first two are joined.
* The caller owns the result and must free() it; NULL is returned when the
* allocation fails.
*/
static char* cat3(char const* str1, char const* str2, char const* str3) {
   size_t const len1 = strlen(str1);
   size_t const len2 = strlen(str2);
   size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
   size_t const total = len1 + len2 + len3;
   char* const joined = (char*)malloc(total + 1);

   if (joined == NULL)
       return NULL;

   memcpy(joined, str1, len1);
   memcpy(joined + len1, str2, len2);
   if (str3 != NULL)
       memcpy(joined + len1 + len2, str3, len3);
   joined[total] = '\0';

   assert(strlen(joined) == total);
   return joined;
}

/** Joins two strings into a newly allocated string (NULL on failure). */
static char* cat2(char const* str1, char const* str2) {
   char* const joined = cat3(str1, str2, NULL);
   return joined;
}

/**
* State needed by the curl callback.
* It takes data from curl, hashes it, and writes it to the file.
*/
typedef struct {
   FILE* file;             /* popen() stream of the decompress/extract command */
   XXH64_state_t xxhash64; /* running hash of the bytes written to `file` */
   int error;              /* errno-style code set by curl_data_create(), 0 if ok */
} curl_data_t;

/**
* Create the curl state.
* Resets the hash and opens a pipe to the shell command that consumes the
* download. On failure `error` is set in the returned state: ENOMEM if the
* command string cannot be built, otherwise the errno left after popen().
*/
static curl_data_t curl_data_create(
   data_resource_t const* resource,
   data_type_t type) {
   curl_data_t state = {};
   char* command;

   XXH64_reset(&state.xxhash64, 0);

   assert(UTIL_isDirectory(g_data_dir));

   /* Files are decompressed straight to the resource path; anything else is
    * decompressed and extracted into the cache directory. */
   command = (type == data_type_file)
       ? cat3("zstd -dqfo '", resource->path, "'")
       : cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
   if (command == NULL) {
       state.error = ENOMEM;
       return state;
   }

   state.file = popen(command, "w");
   free(command);
   if (state.file == NULL)
       state.error = errno;

   return state;
}

/**
* Free the curl state.
* Closes the command pipe and returns the result of pclose().
*/
static int curl_data_free(curl_data_t cdata) {
   int const status = pclose(cdata.file);
   return status;
}

/**
* curl callback. Writes the received chunk to the file and updates the hash
* with the bytes that were actually written.
*
* @returns The number of BYTES consumed. libcurl treats any value other than
*          size * count as a write error and aborts the transfer.
*/
static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
   curl_data_t* cdata = (curl_data_t*)ptr;
   /* fwrite() reports complete items, so convert to bytes before hashing
    * and before reporting back to libcurl. */
   size_t const items = fwrite(data, size, count, cdata->file);
   size_t const bytes = items * size;
   XXH64_update(&cdata->xxhash64, data, bytes);
   return bytes;
}

/**
* Downloads one resource, feeding it through the decompress/extract pipe, and
* verifies the result.
*
* @returns 0 on success; EINVAL when curl options cannot be set or the
*          checksum differs; EIO when the transfer, the pipe, or the
*          existence check of the output fails; otherwise the error reported
*          by curl_data_create().
*/
static int curl_download_resource(
   CURL* curl,
   data_resource_t const* resource,
   data_type_t type) {
   curl_data_t sink;

   /* Only the address of `sink` is registered here; it is filled in below,
    * before the transfer starts. */
   if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0
       || curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sink) != 0)
       return EINVAL;

   sink = curl_data_create(resource, type);
   if (sink.error != 0)
       return sink.error;

   /* Always close the pipe, even if the transfer failed. */
   int const transferErr = curl_easy_perform(curl);
   int const pipeErr = curl_data_free(sink);
   if (transferErr) {
       fprintf(
           stderr,
           "downloading '%s' for '%s' failed\n",
           resource->url,
           resource->path);
       return EIO;
   }
   if (pipeErr) {
       fprintf(stderr, "writing data to '%s' failed\n", resource->path);
       return EIO;
   }

   /* The command must have produced the expected kind of output. */
   if (type == data_type_file) {
       if (!UTIL_isRegularFile(resource->path)) {
           fprintf(
               stderr, "output file '%s' does not exist\n", resource->path);
           return EIO;
       }
   } else if (type == data_type_dir) {
       if (!UTIL_isDirectory(resource->path)) {
           fprintf(
               stderr,
               "output directory '%s' does not exist\n",
               resource->path);
           return EIO;
       }
   }

   /* The hash state outlives the closed pipe; compare it with the expected
    * value. */
   XXH64_hash_t const actual = XXH64_digest(&sink.xxhash64);
   if (actual != resource->xxhash64) {
       fprintf(
           stderr,
           "checksum does not match: 0x%llxLL != 0x%llxLL\n",
           (unsigned long long)actual,
           (unsigned long long)resource->xxhash64);
       return EINVAL;
   }

   return 0;
}

/**
* Download a single data object: its data resource first and then, if it has
* one, its dictionary (always as a plain file).
* Returns 0 on success or the first non-zero error encountered.
*/
static int curl_download_datum(CURL* curl, data_t const* data) {
   int const dataErr = curl_download_resource(curl, &data->data, data->type);
   if (dataErr != 0)
       return dataErr;

   if (!data_has_dict(data))
       return 0;

   return curl_download_resource(curl, &data->dict, data_type_file);
}

/**
* Download all the data.
* Walks the NULL-terminated `data` array and stops at the first failure.
*
* @returns 0 when every object was downloaded, EFAULT otherwise.
*/
static int curl_download_data(data_t const* const* data) {
   if (curl_global_init(CURL_GLOBAL_ALL) != 0)
       return EFAULT;

   CURL* curl = curl_easy_init();
   int err = EFAULT;

   /* Global init succeeded, so global cleanup must run even when no easy
    * handle could be created. */
   if (curl == NULL)
       goto out_global;

   if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
       goto out;
   if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
       goto out;
   if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
       goto out;

   assert(data != NULL);
   for (; *data != NULL; ++data) {
       if (curl_download_datum(curl, *data) != 0)
           goto out;
   }

   err = 0;
out:
   curl_easy_cleanup(curl);
out_global:
   curl_global_cleanup();
   return err;
}

/**
* Fill the path member variable of the data objects.
* The data path is "<dir>/<name>"; objects with a dictionary also get
* "<dir>/<name>.dict" as dictionary path.
*
* @returns 0 on success, ENOMEM if a path cannot be allocated. Paths set
*          before the failure stay in place for data_free_paths() to release.
*/
static int data_create_paths(data_t* const* data, char const* dir) {
   assert(data != NULL);
   for (; *data != NULL; ++data) {
       data_t* const datum = *data;
       datum->data.path = cat3(dir, "/", datum->name);
       if (datum->data.path == NULL)
           return ENOMEM;
       if (data_has_dict(datum)) {
           datum->dict.path = cat2(datum->data.path, ".dict");
           if (datum->dict.path == NULL)
               return ENOMEM;
       }
   }
   return 0;
}

/**
* Free the path member variable of the data objects.
* Both paths of every object are released and reset to NULL, so calling this
* again is harmless.
*/
static void data_free_paths(data_t* const* data) {
   assert(data != NULL);
   while (*data != NULL) {
       data_t* const datum = *data++;

       free((void*)datum->data.path);
       datum->data.path = NULL;

       free((void*)datum->dict.path);
       datum->dict.path = NULL;
   }
}

/* File name of the stamp stored in the cache directory; it holds the hash of
* the data set description (see stamp_hash()). */
static char const kStampName[] = "STAMP";

/**
* Feeds a 64-bit value into the hash in little-endian byte order, so the
* result does not depend on the host endianness.
*/
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
   uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
   XXH64_update(state, &le, sizeof(le));
}

/**
* Hash the data to create the stamp.
* Covers, for each object in order: its name, the expected hashes of its data
* and dictionary, and its type.
*/
static uint64_t stamp_hash(data_t const* const* data) {
   XXH64_state_t hasher;

   XXH64_reset(&hasher, 0);
   assert(data != NULL);
   for (size_t i = 0; data[i] != NULL; ++i) {
       data_t const* const entry = data[i];
       /* The URL is deliberately left out: where we fetch from is irrelevant.
        * The path is left out too, as it is derived from the name. */
       XXH64_update(&hasher, entry->name, strlen(entry->name));
       xxh_update_le(&hasher, entry->data.xxhash64);
       xxh_update_le(&hasher, entry->dict.xxhash64);
       xxh_update_le(&hasher, entry->type);
   }
   return XXH64_digest(&hasher);
}

/**
* Check if the stamp matches the stamp in the cache directory.
*
* @returns 1 when "<dir>/STAMP" exists and holds the hash of `data`,
*          0 otherwise (missing, unreadable, truncated or different stamp,
*          or out of memory). The reason is logged to stderr.
*/
static int stamp_check(char const* dir, data_t const* const* data) {
   char* stamp = cat3(dir, "/", kStampName);
   uint64_t const expected = stamp_hash(data);
   XXH64_canonical_t actual;
   FILE* stampfile = NULL;
   int matches = 0;

   if (stamp == NULL)
       goto out;
   if (!UTIL_isRegularFile(stamp)) {
       fprintf(stderr, "stamp does not exist: recreating the data cache\n");
       goto out;
   }

   stampfile = fopen(stamp, "rb");
   if (stampfile == NULL) {
       fprintf(stderr, "could not open stamp: recreating the data cache\n");
       goto out;
   }

   /* The stamp is exactly one canonical hash; anything shorter is invalid. */
   if (fread(&actual, sizeof(actual), 1, stampfile) != 1) {
       fprintf(stderr, "invalid stamp: recreating the data cache\n");
       goto out;
   }

   matches = (expected == XXH64_hashFromCanonical(&actual));
   if (matches)
       fprintf(stderr, "stamp matches: reusing the cached data\n");
   else
       fprintf(stderr, "stamp does not match: recreating the data cache\n");

out:
   free(stamp);
   if (stampfile != NULL)
       fclose(stampfile);
   return matches;
}

/**
* On success write a new stamp, on failure delete the old stamp.
*
* @param data_err Result of populating the cache; non-zero means the cache is
*                 not trustworthy, so no stamp is written.
* @returns 0 when a new stamp was written; ENOMEM if the stamp path cannot be
*          built; `data_err` if it is non-zero; EIO if the stamp cannot be
*          opened, written or flushed.
*/
static int
stamp_write(char const* dir, data_t const* const* data, int const data_err) {
   char* stamp = cat3(dir, "/", kStampName);
   int err = EIO;

   if (stamp == NULL)
       return ENOMEM;

   if (data_err != 0) {
       err = data_err;
   } else {
       XXH64_canonical_t hash;
       FILE* stampfile;

       XXH64_canonicalFromHash(&hash, stamp_hash(data));

       stampfile = fopen(stamp, "wb");
       if (stampfile != NULL) {
           int const wrote = fwrite(&hash, sizeof(hash), 1, stampfile) == 1;
           /* fclose() flushes buffered data, so a failure here means the
            * stamp on disk may be incomplete. */
           int const closed = fclose(stampfile) == 0;
           if (wrote && closed) {
               err = 0;
               fprintf(stderr, "stamped new data cache\n");
           }
       }
   }

   if (err != 0)
       /* Ignore errors. */
       unlink(stamp);
   free(stamp);
   return err;
}

/**
* Prepares the data cache in `dir`: creates the directory, records the paths
* of all data objects and, unless the existing stamp matches, downloads the
* data again and re-stamps the cache.
*
* @returns 0 on success, EINVAL for a NULL `dir`, ENOMEM on allocation
*          failure, otherwise the error from directory creation or download.
*/
int data_init(char const* dir) {
   int err;

   if (dir == NULL)
       return EINVAL;

   /* This must be first to simplify logic. */
   err = ensure_directory_exists(dir);
   if (err != 0)
       return err;

   /* Save the cache directory. */
   g_data_dir = strdup(dir);
   if (g_data_dir == NULL)
       return ENOMEM;

   err = data_create_paths(g_data, dir);
   if (err != 0) {
       /* Do not leave half-initialized state behind: drop the paths that
        * were created and the saved directory. */
       data_free_paths(g_data);
       free(g_data_dir);
       g_data_dir = NULL;
       return err;
   }

   /* If the stamp matches then we are good to go.
    * This must be called before any modifications to the data cache.
    * After this point, we MUST call stamp_write() to update the STAMP,
    * since we've updated the data cache.
    */
   if (stamp_check(dir, data))
       return 0;

   err = curl_download_data(data);

   /* This must be last, since it must know if data_init() succeeded.
    * Its own result is ignored: without a stamp the cache is simply
    * recreated on the next run. */
   stamp_write(dir, data, err);
   return err;
}

/**
* Releases everything data_init() allocated: the per-object paths and the
* saved cache directory. The saved directory pointer is reset to NULL.
*/
void data_finish(void) {
   char* const savedDir = g_data_dir;

   g_data_dir = NULL;
   data_free_paths(g_data);
   free(savedDir);
}