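/*
    Given a dataset in ARFF format, create a random train/test split:
    the header is copied to both <name>.data and <name>.test, and each
    data row is written to exactly one of the two files.
*/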

#include <assert.h>
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fcntl.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>

/* Classic min/max macros. Each argument may be evaluated twice, so do not
   pass expressions with side effects. */
#define min(a,b)	(((a)<(b))?(a):(b))
#define max(a,b)	(((a)>(b))?(a):(b))

/* Neither of these two is referenced in the visible part of this file. */
#define MAXSTR 20000
#define maxlength 516

/* NOTE: this definition comes after <assert.h> has already been included,
   so it does not disable the assert() calls in this file. */
#define NDEBUG
/* Size of every line buffer, and the length of the `rare` class table
   allocated in main(). */
#define MAXLEN 5000

/*
 * Return a pseudo-random integer in the closed interval [min, max].
 *
 * The value is derived from lrand48() with its five low-order bits shifted
 * away, then reduced modulo the interval width. The caller must ensure
 * min <= max (checked with assert).
 */
int random_uniform_int(int min, int max) {
  long span;
  long draw;

  assert(min <= max);

  span = max - min + 1;     /* number of admissible values */
  draw = lrand48() >> 5;    /* discard the five lowest bits */

  return (int)(draw % span + min);
}

/* Output streams, opened in main() as "<name>.data" (train_ptr) and
   "<name>.test" (test_ptr). parse_header() writes every header line to both;
   main() writes each data row to exactly one of them. */
FILE *train_ptr, *test_ptr;

/* Number of data rows that main() marks in its is_test[] table; derived from
   the first command-line argument. */
int k;

/*
 * fgets()-like line reader for an in-memory, NUL-terminated text buffer.
 *
 * str    - destination buffer, at least `size` bytes long.
 * size   - capacity of `str`; at most size-1 characters are stored.
 * stream - in/out cursor into the text; advanced past what was consumed.
 *
 * A line ends at '\n' or '\r' (the terminator is consumed but not stored, so
 * a "\r\n" pair yields the line followed by one empty line), at the end of
 * the text, or when size-1 characters have been stored.
 *
 * Returns `str`, or NULL once the cursor sits on the terminating NUL (or if
 * size <= 0). The cursor is never moved past the NUL, so calling again after
 * end of input is safe and keeps returning NULL. A final line that lacks a
 * trailing newline is returned rather than dropped.
 */
char *sgets( char * str, int size, char ** stream ) {
    char *cursor;
    int stored = 0;

    if (size <= 0)
        return NULL;

    cursor = *stream;
    if (*cursor == '\0')
        return NULL;            /* end of input; leave the cursor on the NUL */

    while (stored < size - 1 && *cursor != '\0') {
        char c = *cursor++;
        if (c == '\n' || c == '\r')
            break;              /* terminator consumed, not stored */
        str[stored++] = c;
    }
    str[stored] = '\0';

    *stream = cursor;
    return str;
}

/*
 * Return a newly allocated upper-case copy of `s`; `s` itself is not
 * modified.
 *
 * The caller owns the result and must free() it. Returns NULL if the
 * allocation fails.
 */
char *stoupper (char *s) {
     size_t len = strlen(s);    /* measured once, not on every iteration */
     size_t i;
     char *S = malloc(len + 1);

     if (S == NULL)
         return NULL;

     for (i = 0; i < len; i++) {
         /* <ctype.h> functions require a value representable as
            unsigned char; a negative plain char is undefined behaviour */
         S[i] = (char)toupper((unsigned char)s[i]);
     }
     S[len] = '\0';
     return S;
}

/*
 * Return 1 if every character of `s` is whitespace according to isspace()
 * (an empty string counts as blank), 0 otherwise.
 */
int sisspace(char *s) {
    for (; *s != '\0'; s++) {
        /* cast: isspace() is undefined for negative plain-char values */
        if (!isspace((unsigned char)*s)) return 0;
    }
    return 1;
}

/*
 * Scan an in-memory ARFF text: copy its header to both output files and
 * count its attributes and data rows.
 *
 * src        - NUL-terminated ARFF text. Only the local copy of the pointer
 *              is advanced; the caller's pointer is untouched.
 * attributes - out: number of header lines containing "@attribute"
 *              (matched case-insensitively, anywhere in the line).
 * n          - out: number of rows after the "@data" line that are neither
 *              blank nor '%' comments.
 *
 * Every non-blank, non-comment header line, up to and including "@data", is
 * written to the global train_ptr and test_ptr streams, which must already
 * be open for writing. Exits the process if memory runs out.
 */
void parse_header (char *src, int *attributes, int *n) {
    char line[MAXLEN];
    int in_data = 0;

    *attributes = 0;
    *n = 0;

    while (!in_data && sgets(line,MAXLEN,&src)) {
        char *upper;

        if (sisspace(line) || line[0]=='%') { continue; }
        fprintf(train_ptr,"%s\n",line);
        fprintf(test_ptr,"%s\n",line);

        /* stoupper() returns a heap copy: use it once, then release it */
        upper = stoupper(line);
        if (upper == NULL) {
            fprintf(stderr,"Out of memory while parsing header\n");
            exit(1);
        }
        if (strstr(upper,"@ATTRIBUTE")) {
            (*attributes)++;
        }
        free(upper);

        /* "@data", in any case, optionally followed by whitespace only */
        if (strncasecmp(line,"@DATA",5) == 0 && sisspace(line+5)) {
            in_data = 1;
        }
    }

    /* No "@data" line: the text is exhausted and must not be read again. */
    if (!in_data) return;

    while (sgets(line,MAXLEN,&src)) {
        if (sisspace(line) || line[0]=='%') { continue; }
        (*n)++;
    }
}

/*
 * Return 1 if `line` consists solely of whitespace characters (an empty
 * string counts as blank), 0 otherwise.
 */
int islspace(char *line) {
    size_t i, n = strlen(line);   /* size_t: no narrowing of strlen() */

    for (i = 0; i < n; i++) {
        /* cast: isspace() is undefined for negative plain-char values */
        if (!isspace((unsigned char)line[i])) return 0;
    }
    return 1;
}

/*
 * Read everything remaining on `in` into a malloc'd, NUL-terminated buffer.
 * Returns NULL on read error or allocation failure; the caller frees.
 */
static char *slurp_stream(FILE *in) {
    size_t cap = 65536, len = 0;
    char *buf = malloc(cap);

    while (buf != NULL) {
        char *grown;

        len += fread(buf + len, 1, cap - 1 - len, in);
        if (len < cap - 1)
            break;                      /* short read: EOF or error */
        grown = realloc(buf, cap * 2);
        if (grown == NULL) { free(buf); return NULL; }
        buf = grown;
        cap *= 2;
    }
    if (buf == NULL) return NULL;
    if (ferror(in)) { free(buf); return NULL; }
    buf[len] = '\0';
    return buf;
}

/*
 * Build "<input without its extension><suffix>" in a malloc'd string.
 * Only a dot in the final path component counts as an extension; if there
 * is none, the suffix is appended to the whole path. Returns NULL on
 * allocation failure; the caller frees.
 */
static char *output_path(const char *input, const char *suffix) {
    const char *slash = strrchr(input, '/');
    const char *base = slash ? slash + 1 : input;
    const char *dot = strrchr(base, '.');
    size_t stem = dot ? (size_t)(dot - input) : strlen(input);
    char *out = malloc(stem + strlen(suffix) + 1);

    if (out == NULL) return NULL;
    memcpy(out, input, stem);
    strcpy(out + stem, suffix);
    return out;
}

/*
 * Usage: PROG S FILE [CLASS ...]
 *
 * Loads the ARFF file FILE, copies its header to "<name>.data" and
 * "<name>.test" (via parse_header), then picks k distinct data rows at
 * random. A picked row goes to "<name>.test" unless its class label (the
 * integer after the last comma) is one of the CLASS arguments; every other
 * row goes to "<name>.data". k is S*n when S < 1 and S otherwise, capped at
 * the number of data rows n.
 *
 * Returns 0 on success, non-zero if the row count seen while writing differs
 * from the one reported by parse_header or if an output file cannot be
 * flushed. Exits early with a message on usage, I/O or memory errors.
 */
int main(int argc, char *argv[]) {
    char line[MAXLEN];
    int status = 0;
    int attributes, n, i, j;

    srand48(time(NULL));

    if (argc < 3) {
        fprintf(stderr,"Usage: %s S FILE\n",argv[0]);
        fprintf(stderr,"If S is in (0,1), it is treated as percentage\n");
        exit(-1);
    }

    float arg = atof(argv[1]);
    if (!(arg >= 0)) {                  /* written this way to reject NaN too */
        fprintf(stderr,"S must be non-negative, got %s\n",argv[1]);
        exit(-1);
    }

    /* Load the input into a NUL-terminated heap buffer: sgets() scans for a
       terminating NUL, which a read-only file mapping does not guarantee. */
    FILE *in = fopen(argv[2],"rb");
    if (in == NULL) {
        fprintf(stderr,"Can't open input file %s\n",argv[2]);
        exit(-1);
    }
    char *src = slurp_stream(in);
    fclose(in);
    if (src == NULL) {
        fprintf(stderr,"Can't read input file %s\n",argv[2]);
        exit(1);
    }

    /* Output names are sized from the input path, so long paths are safe. */
    char *train_name = output_path(argv[2], ".data");
    char *test_name = output_path(argv[2], ".test");
    if (train_name == NULL || test_name == NULL) {
        fprintf(stderr,"Out of memory\n");
        exit(1);
    }
    if ((train_ptr = fopen(train_name,"w")) == NULL) {
        fprintf(stderr,"Can't open output file %s\n",train_name);
        exit(-1);
    }
    if ((test_ptr = fopen(test_name,"w")) == NULL) {
        fprintf(stderr,"Can't open output file %s\n",test_name);
        exit(-1);
    }

    parse_header(src,&attributes,&n);

    /* rare[c] != 0 marks class c as one whose rows always stay in .data */
    int *rare = calloc(MAXLEN, sizeof *rare);
    /* is_test[r] != 0 marks data row r as selected */
    int *is_test = calloc(n > 0 ? (size_t)n : 1, sizeof *is_test);
    if (rare == NULL || is_test == NULL) {
        fprintf(stderr,"Out of memory\n");
        exit(1);
    }

    for (i = 3; i < argc; i++) {
        int label = atoi(argv[i]);
        if (label < 0 || label >= MAXLEN) {
            fprintf(stderr,"Ignoring rare class %d: must be in [0,%d)\n",
                    label,MAXLEN);
            continue;
        }
        rare[label] = 1;
        printf("rare %d\n",label);
    }

    if (arg < 1) {
        k = (int)(arg*n);
    } else if (arg >= (float)n) {
        /* Asking for more rows than exist would make the rejection loop
           below spin forever; cap at n. */
        if (arg > (float)n) {
            fprintf(stderr,"Requested %s rows but only %d available; using %d\n",
                    argv[1],n,n);
        }
        k = n;
    } else {
        k = (int)arg;
    }

    /* NOTE(review): the message says "training", but the k selected rows are
       the ones written to the .test file below - confirm which is intended. */
    printf("Examples for training %d, Total %d, Attributes %d\n",
        k,n,attributes);

    /* Pick k distinct row indices by rejection sampling (k <= n here). */
    for (i = 0; i < k; i++) {
        do {
            j = random_uniform_int(0, n-1);
        } while (is_test[j]);
        is_test[j] = 1;
    }

    /* Second pass over the text: route each data row to one output file. */
    char *cursor = src;
    i = 0;
    while (sgets(line,MAXLEN,&cursor)) {
        char *tok;
        int y, is_rare;

        if (sisspace(line) || line[0]=='%' || line[0]=='@') { continue; }

        /* The class label is the last comma-separated field; a row without
           any comma is treated as consisting of the label alone. */
        tok = strrchr(line,',');
        tok = (tok != NULL) ? tok + 1 : line;
        y = atoi(tok); /* true class */
        is_rare = (y >= 0 && y < MAXLEN) && rare[y];

        /* i < n guards is_test[] if this pass sees more rows than
           parse_header counted. */
        if (i < n && is_test[i] && !is_rare) {
            fprintf(test_ptr,"%s\n",line);
        }
        else {
            fprintf(train_ptr,"%s\n",line);
        }
        i++;
    }

    if (i != n) {
        fprintf(stderr,"Warning: wrote %d data rows but header scan counted %d\n",
                i,n);
        status = 1;
    }

    if (fclose(train_ptr) != 0) {
        fprintf(stderr,"Error writing %s\n",train_name);
        status = 1;
    }
    if (fclose(test_ptr) != 0) {
        fprintf(stderr,"Error writing %s\n",test_name);
        status = 1;
    }

    free(is_test);
    free(rare);
    free(train_name);
    free(test_name);
    free(src);
    return status;
}
