/* israndom randomness test using information theory and data compressors.
 * based on Shannon entropy and Kolmogorov Complexity
 * by Rudi Cilibrasi <cilibrar@cilibrar.com>
 */
#include <complearn.h>
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#define DEFAULT_SAMPLESIZE (6*65536)
#define DEFAULT_BLOCKSIZE 128

/* Settings gathered from the command line for a single israndom run. */
struct IsRandomOptions {
  gboolean is_quiet;        /* nonzero: suppress the status messages */
  gboolean ignore_newline;  /* nonzero: '\n' bytes are skipped, not counted as samples */
  gboolean ignore_cr;       /* nonzero: '\r' bytes are skipped, not counted as samples */
  guint32 alphasize;        /* alphabet size; 0 selects the number of distinct bytes seen */
  guint32 samplesize;       /* samples to analyze; 0 requests however many were read */
  GString *filename;        /* input path; "-" selects standard input */
  char *compressor;         /* name passed to the CompLearn compressor loader */
};

/* Write the word "true" to fp when val is nonzero, and "false" otherwise.
 * No newline is appended. */
static void print_boolean(FILE *fp, guint32 val) {
  const char *word = "false";
  if (val)
    word = "true";
  fputs(word, fp);
}
/* Print a human-readable summary of the options in o to fp.
 *
 * When newline and/or carriage-return skipping is enabled, a parenthesised
 * note is printed first: "(ignoring newline)", "(ignoring carriage return)"
 * or "(ignoring newline and carriage return)".  Then the alphabet size,
 * compressor name, sample size and filename are listed, one per line.
 */
static void print_options(FILE *fp, const struct IsRandomOptions *o) {
  if (o->ignore_newline || o->ignore_cr) {
    fprintf(fp, "(ignoring ");
    if (o->ignore_newline)
      fprintf(fp, "newline");
    /* the separator is only wanted when both items are listed */
    if (o->ignore_newline && o->ignore_cr)
      fprintf(fp, " and ");
    if (o->ignore_cr)
      fprintf(fp, "carriage return");
    fprintf(fp, ")\n");
  }
  /* alphasize and samplesize are guint32, so they need an unsigned conversion */
  fprintf(fp, "alphabet size             : %u\n", o->alphasize);
  fprintf(fp, "compressor                : %s\n", o->compressor);
  fprintf(fp, "sample   size             : %u\n", o->samplesize);
  fprintf(fp, "filename                  : %s\n", o->filename->str);
}

/* Build the option set used when no command-line flags are given:
 * verbose output, no bytes skipped, alphabet size 0 (derive it from the
 * data), DEFAULT_SAMPLESIZE samples, input from "-" (standard input) and
 * the "bzlib" compressor.
 *
 * filename is a newly created GString and compressor is a strdup()ed
 * string; both belong to the returned structure.
 */
static struct IsRandomOptions default_options(void) {
  struct IsRandomOptions iro;
  iro.is_quiet = 0;
  iro.ignore_newline = 0;
  iro.ignore_cr = 0;
  /* 0 means "use the empirical alphabet size"; every field must be set here
   * because the structure is read even when the user passes no -a option. */
  iro.alphasize = 0;
  iro.samplesize = DEFAULT_SAMPLESIZE;
  iro.filename = g_string_new("-");
  iro.compressor = strdup("bzlib");
  return iro;
}

/* Read the next chunk of at most DEFAULT_BLOCKSIZE bytes from fp.
 *
 * Returns a newly created GString holding the bytes that were read, or NULL
 * when nothing was read.  Once any call reads fewer than DEFAULT_BLOCKSIZE
 * bytes, a function-static flag is latched and every later call returns NULL
 * without touching fp (so the function serves a single stream per process).
 */
static GString *read_from_file(FILE *fp) {
  static int gotshortread;
  char buf[DEFAULT_BLOCKSIZE];
  GString *chunk = NULL;
  int got;

  if (gotshortread)
    return NULL;

  got = fread(buf, 1, sizeof buf, fp);
  if (got < DEFAULT_BLOCKSIZE)
    gotshortread = 1;   /* short read: remember it so later calls stop */
  if (got > 0)
    chunk = g_string_new_len(buf, got);
  return chunk;
}

/* Write the full usage/manual text to standard error. */
static void printHelp(void) {
  fputs(
"\n"
"israndom randomness tester by Rudi Cilibrasi <cilibrar@cilibrar.com>\n"
"\n"
"    randomness  testing using data compressors over fixed-size alphabets\n"
"\n"
"SYNOPSIS\n"
"\n"
" israndom [-a alphasize] [-c compressor] [-s samplelen]  [-qhnr] [filename]\n"
"\n"
"DESCRIPTION\n"
"      israndom tests a sequence of symbols for randomness.  israndom tries to\n"
"      determine if a given sequence of trials could reasonably be assumed  to\n"
"      be  from  a  random  uniform distribution over a fixed-size alphabet of\n"
"      2-256 symbols.\n"
"\n"
"      israndom assumes that each sequence (or sample trial) is represented by\n"
"      exactly  one byte.  The only exceptions to this rule are in the case of\n"
"      the\n"
"             -n and -r options which ignore newlines  and  carriage  returns,\n"
"             respectively (see below).\n"
"\n"
"      israndom is based on the mathematical ideas of Shannon, Kolmogorov, and\n"
"      Cilibrasi and uses the following formula to determine an expected  size\n"
"      for a sample of\n"
"             k  trials  of  a  uniform distribution over an alphasize- symbol\n"
"             alphabet.  Each symbol takes log(alphasize) bits, so  the  total\n"
"             cost (in bits) c for the ensemble of samples is k log(alphasize)\n"
"             bits.  This number  is  rounded  up  to  the  nearest  byte  and\n"
"             increased by one to arrive at the final estimate of the expected\n"
"             communication cost on the assumption of uniform randomness.\n"
"\n"
"      If the compressed size of\n"
"             k samples is less than c then this represents a randomness\n"
"             deficiency and the randomness test fails.  israndom will exit\n"
"             with nonzero status.  If israndom indicates  that  a  source  is\n"
"             nonrandom,  this  fact is effectively certain if the compression\n"
"             module is correct and invertable.  If the compressed size is  at\n"
"             least  the threshhold value c then the file appears to be random\n"
"             and passes the test and israndom  will  exit  with  a  0  return\n"
"             value.   In  either  case,  it  will  print  the  alphabet size,\n"
"             expected compressed size, sample count, and  randomness  differ‐\n"
"             ence before exitting with an appropriate return code.\n"
"\n"
"      The  default number of samples is 393216.  Although larger sizes should\n"
"      increase accuracy, using too few samples will cause the method to  fail\n"
"      to be able to resolve randomness in certain situations.  This is a the‐\n"
"      oretically unavoidable fact for all effective randomness tests.\n"
"\n"
"      If  a filename is given, it is read to find the samples to analyze.  If\n"
"      the filename \"-\" is given, or no filename is given at all, then  isran‐\n"
"      dom reads from standard input.\n"
"\n"
"      If text files are to be used, it is important to specify one or both of\n"
"      -n and -r since without these, end of line characters will be misinter‐\n"
"      preted as samples.\n"
"\n"
"OPTIONS\n"
"      -c compressor_name\n"
"      -n     ignore newlines (so that text files may be used)\n"
"      -r     ignore carriage returns (so that text files may be used)\n"
"      -a alphasize\n"
"      -s samplecount\n"
"      -q     quiet mode, with no extra status messages\n"
"      -h     print help and exit.\n"
"\n"
"\n"
"      EXAMPLES\n"
"             First, we can verify that the cryptographicly strong random num‐\n"
"             ber generator is correct:\n"
"\n"
"      israndom /dev/urandom\n"
"\n"
"      Next,  we  can  notice that the \"od\" command, without extra options, is\n"
"      not random because it prints  out  addresses  and  spaces  predictably.\n"
"      Most compressors can tell by the regular spaces that it is not random:\n"
"\n"
"      od /dev/urandom | israndom -n -r\n"
"\n"
"      but  if  we  remove  spaces using ’tr’ then a more powerful compressor,\n"
"      lzmax, is required to demonstrate the non-randomness of the sequence:\n"
"\n"
"      od /dev/urandom | tr -d ' ' | israndom -n -r -c lzmax\n"
"\n"
"      Removing the address lines using an\n"
"             od  option  yields  the  expected  result  once  again  that the\n"
"             sequence is effectively random:\n"
"\n"
"      od -An /dev/urandom | tr -d ' ' | israndom -n -r -c lzmax\n"
  , stderr);
}

guint32 process_options(struct IsRandomOptions *iro, const char *optcur, const char *optnext) {
  if (strcmp("-q", optcur) == 0 || strcmp("--quiet", optcur) == 0)
    { iro->is_quiet = 1; return 1; }
  if (strcmp("-h", optcur) == 0 || strcmp("--help", optcur) == 0)
    { printHelp(); exit(0); }
  if (strcmp("-n", optcur) == 0 || strcmp("--ignore-newlines", optcur) == 0)
    { iro->ignore_newline = 1; return 1; }
  if (strcmp("-r", optcur) == 0 || strcmp("--ignore-crs", optcur) == 0)
    { iro->ignore_cr = 1; return 1; }
  if (strcmp("-s", optcur) == 0 && optnext != NULL)
    { iro->samplesize = atoi(optnext); return 2; }
  if (strcmp("-a", optcur) == 0 && optnext != NULL)
    { iro->alphasize = atoi(optnext); return 2; }
  if (strcmp("-c", optcur) == 0 && optnext != NULL)
    { iro->compressor = strdup(optnext); return 2; }
  return 0;
}

/* Return how many distinct byte values (0..256) occur in the first
 * str->len bytes of str->str. */
static int count_unique(GString *str)
{
  unsigned char seen[256] = { 0 };
  const unsigned char *bytes = (const unsigned char *) str->str;
  int distinct = 0;
  gsize pos;

  for (pos = 0; pos < str->len; pos += 1) {
    /* count a value only the first time it shows up */
    if (!seen[bytes[pos]]) {
      seen[bytes[pos]] = 1;
      distinct += 1;
    }
  }
  return distinct;
}

/* Report an inconclusive result on stderr (unless quiet mode is on) and
 * terminate the process with exit status 1.  Does not return. */
void indicate_uncertain(struct IsRandomOptions *iro)
{
  if (!iro->is_quiet)
    fputs("inconclusive; please try a larger sample\n", stderr);
  exit(1);
}

/* Report a "random" verdict on stderr (unless quiet mode is on) and
 * terminate the process with exit status 0.  Does not return. */
void indicate_random(struct IsRandomOptions *iro)
{
  if (!iro->is_quiet)
    fputs("random\n", stderr);
  exit(0);
}

/* Report a "not random" verdict on stderr (unless quiet mode is on) and
 * terminate the process with exit status 1.  Does not return. */
void indicate_nonrandom(struct IsRandomOptions *iro)
{
  if (!iro->is_quiet)
    fputs("not random\n", stderr);
  exit(1);
}

int main(int argc, char **argv) {
  FILE *infile = NULL;
  int i;
  g_type_init();
  struct IsRandomOptions iro = default_options();
  guint32 got_filename = 0;
  for (i = 1; argv[i]; i += 1) {
    int opc;
    opc = process_options(&iro, argv[i], argv[i+1]);
    if (opc == 0) {
      if (got_filename) {
        fprintf(stderr, "Error, already got filename %s; %s is extra.\n", iro.filename->str, argv[i]);
        exit(1);
      }
      iro.filename = g_string_new(argv[i]);
      got_filename = 1;
      i += 1;
    } else {
        if (opc < 0) {
          fprintf(stderr, "Option error on %s\n", argv[i]);
          exit(1);
        }
      }
      i += opc - 1;
    }
  assert(iro.alphasize >= 0);
  if (!iro.is_quiet)
    print_options(stdout, &iro);
  if (iro.filename->len == 1 && iro.filename->str[0] == '-')
    infile = stdin;
  else {
    infile = fopen(iro.filename->str, "rb");
    if (infile == NULL) {
      fprintf(stderr, "Error, cannot open file %s\n", iro.filename->str);
      exit(1);
    }
  }
  GString *acc = g_string_new("");
  while (iro.samplesize == 0 || acc->len < iro.samplesize) {
    GString *cur = read_from_file(infile);
    int i;
    if (cur == NULL)
      break;
    for (i = 0; i < cur->len && acc->len < iro.samplesize; i += 1) {
      char c = cur->str[i];
      if (c == '\r' && iro.ignore_cr)
        continue;
      if (c == '\n' && iro.ignore_newline)
        continue;
      g_string_append_c(acc, c);
    }
  }
  if (iro.samplesize == 0)
    iro.samplesize = acc->len;
  if (iro.samplesize < 128) {
    fprintf(stderr, "Error, must have at least 128 samples to analyze.\n");
    exit(1);
  }
  if (acc->len < iro.samplesize) {
    fprintf(stderr, "Error, only read %d samples before EOF.\n", acc->len);
    fprintf(stderr, "%d samples were necessary.\n", iro.samplesize);
    exit(1);
  }
  if (iro.is_quiet == 0)
    printf("\n... read %d samples...\n\n", acc->len);
  int histocount = count_unique(acc);
  if (iro.is_quiet == 0)
    printf("Emprical alphabet size : %d\n", histocount);
  if (histocount < 2) {
    indicate_nonrandom(&iro);
  }
  int effective_a;
  if (iro.alphasize == 0)
    effective_a = histocount;
  else
    effective_a = iro.alphasize;
  double bitspersym = log(effective_a)/log(2);
  if (iro.is_quiet == 0)
    printf("bits per symbol        : %f\n", bitspersym);
  double targetlength = bitspersym * iro.samplesize;
  double roundedtarget = ((int) (targetlength+15)/8)*8;
  if (iro.is_quiet == 0) {
    printf("total ideal length     : %f\n",  targetlength);
    printf("rounded ideal length   : %f\n", roundedtarget);
  }
  CompLearnRealCompressor *rc = complearn_environment_load_compressor_named(iro.compressor);
  if (rc == 0) {
    fprintf(stderr, "Error, cannot load compressor %s.\n", iro.compressor);
    exit(1);
  }
  double clen = real_compressor_compressed_size(rc, acc);
  if (iro.is_quiet == 0)
    printf("compressed size        : %f\n", clen);
  double rdd = clen - roundedtarget;
  if (iro.is_quiet == 0)
    printf("randomness difference  : %f\n\n", rdd);
  if (rdd > 0)
    indicate_random(&iro);
  if (rdd < 0)
    indicate_nonrandom(&iro);
  indicate_uncertain(&iro);
  return 0;
}
