/*************************************
** 
** 
** 
** modification of, n-gram entropy of a file
** 
** Modified by:
** D. Snuggs
** October 25, 2004
**
** Modification:
** The original program looked at each n-gram then discarded it, the new version
** shifts the bits so that each unique sequence is looked at.
**
** "proj7.C" (n-gram entropy of a file)
**   This program calculates the n-gram entropy of a file treated as a bit 
** stream. The user provides the filename and the value of n (due to memory 
** requirements, the program restricts n such that 0 < n <= 25). The program 
** divides the file into blocks of n bits. If the last block does not contain n 
** bits, then it is thrown away since there is no reason to assume that any 
** particular padding would be appropriate. The number of possible sequences of 
** n bits equals 2^n, so an array of type int is dynamically allocated. When the 
** blocks of n bits are processed, the array (initialized to zero) is used to 
** count the number of occurrences of each bit sequence. The total number of 
** blocks (ignoring the last block if it is incomplete) is also recorded. To 
** compute the entropy, H(x) = -Summation(p(i)logBase2(p(i))) for i = 0 to 
** [(2^n) - 1] is used. To calculate p(i), the count of each sequence is divided 
** by the total number of blocks.
**
** Compile: g++ proj7.C
** Usage: a.out <filename> <n>
*************************************/

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

void checkNumArguments(int argc, int expected);
int readFile(char *filename, int ngram, int *sequences);
float computeEntropy(int counts[], int numCounts, int total);

using namespace std;

int main(int argc, char *argv[])
{
   int ngram;                // value of n
   int numSeq;               // number of unique n-bit sequences
   int *sequences;           // array to hold sequence counts
   int total;                // total number of blocks
   float entropy;            // entropy
   
   /* checks command-line arguments restricts n to (0 < n <= 25) */
   checkNumArguments(argc, 3);
   ngram = atoi(argv[2]);
   if(ngram < 1){
      cerr << "The n-gram value must be greater than 0, exiting.";
      cerr << endl;
      exit (-1);
   }
   if(ngram > 25){
      cerr << "The n-gram value can not be greater than 25, exiting.";
      cerr << endl;
      exit (-1);
   }
   
   /* dynamically allocates array of length 2^n*/
   numSeq = (int) pow(2.0, ngram);
   sequences = (int*) malloc(sizeof(int) * numSeq);
   for(int i = 0; i < numSeq; i++){
      sequences[i] = 0;
   }
   
   /* reads file, processes bit blocks and computes entropy */
   total = readFile(argv[1], ngram, sequences);
   entropy = computeEntropy(sequences, numSeq, total);
   
   /* displays entropy */
   cout << ngram << "-gram entropy for " << argv[1] << " = ";
   cout << entropy << endl;
   
   /* frees dynamically allocated memory */
   free(sequences);
   
   return 0;
}

/*******************************
Function: checkNumArguments
   This function determines if the correct number of command line arguments are
provided. If not, and error message is displayed and program exits.
Inputs: expected and actual number of command line arguments
Outputs: nothing if counts match, otherwise, error message and program exits.
*******************************/
void checkNumArguments(int argc, int expected)
{
   /* if an inappropriate number of arguments exist, and error message  **
   ** is displayed and the program exits                                */
   if(argc != expected)
   {
      cerr << "This program requires " << expected << endl;
      cerr << "command line arguments." << endl;
      cerr << "Usage: <executable> <filename> <n-gram value>" << endl;
      cerr << "Exiting program." << endl;
      exit (-1);
   }
}

/*******************************
Function: readFile
   This function reads the bits from a file, stores the number of each bit 
sequence in an array and returns the total number of blocks.
Inputs: filename, size of the blocks and array of sequence counts
Outputs: total number of blocks
*******************************/
int readFile(char *filename, int ngram, int *sequences)
{
   FILE *inFile;              // input file pointer
   int finished = 0;          // indicates if reading of file is complete
   int currLength;            // current length of bit block
   int *currString = (int*) malloc(sizeof(int) * ngram); // current block
   char buffChar;             // holds byte read from file
   int buffer[8];             // stores bits from byte read from file
   int buffNum = 8;           // indexes buffer[]
   int buffSize = 0;          // number of bytes read from file
   int value;                 // integer value of bit block
   int total = 0;             // total number of blocks
   int first = 1;			// indicates if the first ngram has been read
   
   /* opens and verifies file */
   inFile = fopen(filename, "r");
   if(inFile == NULL){
      cout << "Could not open file, exiting." << endl;
      exit(-2);
   }
   
   /* reads file and fills blocks of n bits */
   while(finished == 0)
	{
      currLength = 0;
      
      /* fills a block of n bits */
      for(int i = 0; (i < ngram) && (finished == 0) && (first == 1); i++)
		{
         /* refills buffer if all bits have been used */
         if(buffNum >= 8){
            buffNum = 0;
            buffSize = fread(&buffChar, sizeof(buffChar), 1, inFile);
            
            /* if at end of file, indicate that reading is finished **
            ** else, split buffer byte into individual bits         */
            if(buffSize < sizeof(buffChar))
				{
               finished = 1;
            }
				else
				{
               for(int j = 0; j < 8; j++)
					{
                  buffer[7 - j] = (int) (((unsigned) buffChar) % 2);
                  buffChar = (char) (((unsigned) buffChar) / 2);
               }
            }
         }//end if
         /* if not finished, read a bit into the current string (block) **
         ** and increment indices                                       */
         if(finished == 0){
            currString[i] = buffer[buffNum];
            buffNum++;
            currLength++;
         }
			
      }//end for
	// both ngrams do this
	first = 0;
      /* if block is filled, increment the counts */
      if(currLength == ngram){
         value = 0;
         for(int i = 0; i < ngram; i++){
            value = (2 * value) + currString[i];
         }
         sequences[value]++;
         total++;
      }

	//modification begins here
	while((finished == 0) && (first == 0))
	{
	/* refills buffer if all bits have been used */
		if(buffNum >= 8)
		{
			buffNum = 0;
			buffSize = fread(&buffChar, sizeof(buffChar), 1, inFile);
            
			/* if at end of file, indicate that reading is finished **
			** else, split buffer byte into individual bits         */
			if(buffSize < sizeof(buffChar))
			{
				finished = 1;
			}else
			{
				for(int j = 0; j < 8; j++)
				{
					buffer[7 - j] = (int) (((unsigned) buffChar) % 2);
					buffChar = (char) (((unsigned) buffChar) / 2);
				}
			}
		}// end if
		/* if not finished, read a bit into the current string (block) **
		** and increment indices                                       */
		//shift current string and increment indices
		if(finished == 0)
		{
			for (int k = 1; k < ngram; k++)
			{
				currString[k-1] = currString[k];
			}
			currString[ngram-1] = buffer[buffNum];
			buffNum++;
		}
			 // both ngrams do this
	 	first = 0;
      /* if block is filled, increment the counts */
      if(currLength == ngram and finished ==0){
         value = 0;
         for(int i = 0; i < ngram; i++){
            value = (2 * value) + currString[i];
         }
         sequences[value]++;
         total++;
      }
	}//end inner while

   }//end outer while
   
   return total;
}

/*******************************
Function: computeEntropy
   This function computes entropy for an array containing the counts with a 
given total.
Inputs: array containing counts, size of array, sum of all counts
Outputs: entropy
*******************************/
float computeEntropy(int counts[], int numCounts, int total)
{
   float sum = 0;      // running sum
   float probability;  // current probability
   
   /* computes entropy */
   for(int i = 0; i < numCounts; i++){
      if(counts[i] > 0){
         probability = ((float) counts[i]) / ((float) total);
         /* sum is "-=" because the entropy is negative the summation... */
         sum -= probability * (log10(probability) / log10(2.0));
      }
   }
   
   return sum;
}
