/*
	costing takes an input file in Weka format (.arff) in which
	the last attribute of each example is the importance (weight) of that example.
	It outputs a set of k training sets for a binary classifier learner,
	written as <output-directory>/binary-0 ... binary-(k-1).
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h> /* strcasecmp() */
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h> /* mmap() is defined in this header */
#include <fcntl.h>
#include <unistd.h> /* read(), close() */
#include <time.h>
#include <assert.h>

/* Size of the line buffers passed to sgets(); a single call therefore
   returns at most MAXLEN-1 characters. */
#define MAXLEN 5000

/* Function-like min/max. The selected argument is evaluated twice, so do
   not pass expressions with side effects (e.g. i++). */
#define min(a,b)	(((a)<(b))?(a):(b))
#define max(a,b)	(((a)>(b))?(a):(b))

/*
 * sgets - fgets()-like line reader over an in-memory, NUL-terminated string.
 *
 * str    : destination buffer for the line (the line terminator is NOT stored)
 * size   : capacity of str in bytes; at most size-1 characters are copied,
 *          so a longer line is returned in several pieces
 * stream : address of the read cursor; on return it points just past the
 *          consumed text
 *
 * A line ends at '\n', '\r', or "\r\n" (treated as a single terminator), or
 * at the end of the string. A final line without a trailing newline is still
 * returned.
 *
 * Returns str when a (possibly empty) line was read. Returns NULL when the
 * cursor is already at the terminating NUL or the arguments are unusable; in
 * that case str is left untouched and the cursor is NOT moved past the NUL,
 * so calling again after end-of-input is safe.
 */
char *sgets( char * str, int size, char ** stream ) {
    char *cur;
    int i = 0;

    if ( NULL == str || size <= 0 || NULL == stream || NULL == *stream )
        return NULL;

    cur = *stream;
    if ( '\0' == *cur )
        return NULL;            /* end of input: leave the cursor on the NUL */

    while ( i < size-1 && '\0' != *cur ) {
        char ch = *cur++;

        if ( '\n' == ch )
            break;
        if ( '\r' == ch ) {
            /* swallow the '\n' of a DOS-style "\r\n" pair */
            if ( '\n' == *cur )
                cur++;
            break;
        }
        str[i++] = ch;
    }
    str[i] = '\0';
    *stream = cur;

    return str;
}

/*
 * compute_normalization - scan an in-memory ARFF text and return the largest
 * importance weight found in its data section.
 *
 * src    : NUL-terminated text of the whole file (read with sgets())
 * header : *header must point to a caller-owned buffer large enough to hold
 *          every header line plus one '\n' per line and a terminating NUL;
 *          this function cannot check the capacity. On return it holds all
 *          lines up to and including the "@DATA" line, each ended by '\n'.
 *
 * The header ends at the first line that equals "@DATA" (case-insensitive,
 * whole-line match). On every following non-blank line the last field
 * (fields are separated by space, comma or tab) is parsed as a number.
 *
 * Returns the maximum of those numbers and 0, i.e. 0 when there are no data
 * lines or no weight is positive.
 */
double compute_normalization (char *src, char **header) {
    double Z = 0;
    char *s = src;
    char line[MAXLEN];
    char *out = *header;        /* append position inside the header buffer */

    *out = '\0';
    while (sgets(line,MAXLEN,&s)) {
        size_t len = strlen(line);

        /* Append at a tracked offset; sprintf(buf,"%s...",buf) would pass
           overlapping source and destination, which is undefined. */
        memcpy(out, line, len);
        out += len;
        *out++ = '\n';
        *out = '\0';

        if (strcasecmp(line,"@DATA") == 0)
            break;
    }

    while (sgets(line,MAXLEN,&s)) {
        char *last = NULL;
        char *tok;
        double w;

        /* the weight is the last field on the line */
        for (tok = strtok(line," ,\t"); tok; tok = strtok(NULL," ,\t"))
            last = tok;
        if (!last)
            continue;           /* blank line: nothing to parse */

        /* parse in place; no fixed-size copy that a long token could overflow */
        w = strtod(last, NULL);
        if (w > Z)
            Z = w;
    }
    return Z;
}


/* Characters that separate fields on a data line. */
static const char costing_field_seps[] = " ,\t";

/*
 * Read the whole file at `path` into a freshly malloc'd, NUL-terminated
 * buffer (sgets() needs the terminator to detect end of input).
 * Returns the buffer, which the caller must free(), or NULL after printing
 * a message to stderr.
 */
static char *costing_read_file(const char *path) {
    struct stat statbuf;
    char *buf;
    size_t total, got = 0;
    int fd = open(path, O_RDONLY);

    if (fd < 0) {
        fprintf(stderr,"Failed to open %s for reading\n",path);
        return NULL;
    }
    /* find size of input file */
    if (fstat(fd,&statbuf) < 0) {
        fprintf(stderr,"fstat error");
        close(fd);
        return NULL;
    }
    total = (size_t)statbuf.st_size;
    buf = malloc(total + 1);
    if (!buf) {
        fprintf(stderr,"Out of memory reading %s\n",path);
        close(fd);
        return NULL;
    }
    while (got < total) {
        ssize_t n = read(fd, buf + got, total - got);
        if (n < 0) {
            perror("read");
            free(buf);
            close(fd);
            return NULL;
        }
        if (n == 0)
            break;              /* file is shorter than fstat reported */
        got += (size_t)n;
    }
    buf[got] = '\0';
    close(fd);
    return buf;
}

/*
 * Split a data line into its attribute text and its trailing weight.
 * The weight is the last field; it is parsed into *weight and then cut off
 * `sample` together with the separators in front of it.
 * Returns 1 on success, 0 for a blank line (sample untouched), and -1 when
 * the line holds nothing but a single field (no attributes before the weight).
 */
static int costing_split_weight(char *sample, double *weight) {
    size_t end = strlen(sample);
    size_t start;

    /* ignore trailing separators */
    while (end > 0 && strchr(costing_field_seps, sample[end-1]))
        end--;
    if (end == 0)
        return 0;

    /* walk back to the first character of the last field */
    start = end;
    while (start > 0 && !strchr(costing_field_seps, sample[start-1]))
        start--;
    *weight = strtod(sample + start, NULL);

    /* drop the separators between the attributes and the weight */
    while (start > 0 && strchr(costing_field_seps, sample[start-1]))
        start--;
    if (start == 0)
        return -1;

    sample[start] = '\0';
    return 1;
}

/*
 * Usage: costing importance-weighted-dataset output-directory #runs
 *
 * Reads the ARFF file argv[1], creates argv[3] output files named
 * <argv[2]>/binary-<i>, copies the header (every line up to and including
 * "@DATA") to each of them, and then, independently for each output file,
 * keeps every data line with probability weight/Z, where weight is the last
 * field of the line. Kept lines are written without the weight field.
 *
 * Returns 0 on success; exits or returns with status 1 on bad arguments,
 * I/O errors, or malformed input.
 */
int main (int argc, char *argv[]) {
    FILE **fpw;
    long i, k;
    char *src, *cursor, *endp;
    char line[MAXLEN], sample[MAXLEN], file_name[256];
    size_t dirlen;
    int found_data = 0;
    int status = 0;
    /* NOTE(review): Z stays 1, so weights are used directly as acceptance
       probabilities. compute_normalization() in this file returns the
       maximum weight but is never called here -- confirm whether the input
       is expected to be pre-scaled to [0,1]. */
    double Z = 1;

    srand((unsigned) time(NULL));

    if (argc<4) {
        fprintf(stderr,
            "Usage: %s importance-weighted-dataset output-directory #runs\n",
            argv[0]);
        exit(1);
    }

    /* k is the number of runs */
    k = strtol(argv[3], &endp, 10);
    if (endp == argv[3] || k <= 0) {
        fprintf(stderr,"#runs must be a positive integer, got '%s'\n",argv[3]);
        exit(1);
    }

    src = costing_read_file(argv[1]);
    if (!src) exit(1);
    cursor = src;

    fpw = calloc((size_t)k, sizeof *fpw);
    if (!fpw) {
        fprintf(stderr,"Out of memory for %ld output files\n",k);
        exit(1);
    }

    /* remove the trailing '/' from the directory name */
    /* argv[2] contains the output directory for binary training sets */
    dirlen = strlen(argv[2]);
    if (dirlen > 0 && argv[2][dirlen-1]=='/') argv[2][dirlen-1]='\0';

    for (i=0;i<k;i++) {
        int n = snprintf(file_name,sizeof file_name,"%s/binary-%ld",argv[2],i);
        if (n < 0 || (size_t)n >= sizeof file_name) {
            fprintf(stderr,"Output path too long: %s/binary-%ld\n",argv[2],i);
            exit(1);
        }
        if (!(fpw[i] = fopen(file_name,"w"))) {
            fprintf(stderr,"Failed to open %s for writing\n",file_name);
            exit(1);
        }
    }

    /* copy the weka header, line by line, to every output file */
    while (sgets(line,MAXLEN,&cursor)) {
        for (i=0;i<k;i++)
            fprintf(fpw[i],"%s\n",line);
        if (strcasecmp(line,"@DATA") == 0) {
            found_data = 1;
            break;
        }
    }
    if (!found_data) {
        /* input is exhausted; do not call sgets() again */
        fprintf(stderr,"No @DATA section found in %s\n",argv[1]);
        status = 1;
    }

    while (found_data && sgets(sample,MAXLEN,&cursor)) {
        double weight;
        int rc = costing_split_weight(sample,&weight);

        if (rc == 0)
            continue;           /* blank line */
        if (rc < 0) {
            fprintf(stderr,"Malformed data line (no attributes before weight): %s\n",
                sample);
            status = 1;
            break;
        }

        /* rejection sample with probability c/Z */
        for (i=0;i<k;i++) {
            if (rand() / (RAND_MAX + 1.0) <= weight/Z) {
                fprintf(fpw[i],"%s\n",sample);
            }
        }
    }

    /* fclose() flushes buffered output; a failure here means lost data */
    for (i=0;i<k;i++) {
        if (fclose(fpw[i]) != 0) {
            fprintf(stderr,"Failed to write %s/binary-%ld\n",argv[2],i);
            status = 1;
        }
    }
    free(fpw);
    free(src);

    return status;
}
