#!/bin/bash

# Require exactly five positional arguments.  On a mismatch print the
# usage line to stderr and stop with a non-zero status so that a calling
# script or job scheduler can tell the run did not happen.
if [ $# -ne 5 ] ; then
  echo "Usage: run ALG DATADIR NUMFOLDS DELTA WORKDIR" >&2
  exit 1
fi
  
# Positional arguments, in command-line order.
ALG=$1
DATADIR=$2
NUMFOLDS=$3
DELTA=$4
WORKDIR=$5
# Directory the script was started from.
STARTDIR=$(pwd)
# SEMIBOUND_DIR must be set and non-empty; it becomes BASEDIR, the root
# under which the cv/ and algs/ tools are looked up.  The expansion is
# quoted so a value containing whitespace does not break the test.
if [ -n "$SEMIBOUND_DIR" ] ; then
  BASEDIR=$SEMIBOUND_DIR
else
  echo "SEMIBOUND_DIR not defined!" >&2
  exit 1
fi

# DATADIR must name an existing directory; report the problem on stderr
# and fail with a non-zero status otherwise.
if [ ! -d "$DATADIR" ] ; then
  echo "Directory $DATADIR does not exist!" >&2
  exit 1
fi
# Turn DATADIR into an absolute path by entering it and asking pwd, then
# go back to the start directory.  Stop if either cd fails, because every
# later path would otherwise be resolved against the wrong directory.
cd -- "$DATADIR" || exit 1
DATADIR=$(pwd)
cd -- "$STARTDIR" || exit 1

# copy the data to workdir
# WORKDIR must not exist yet: it is created fresh here.
if [ -e "$WORKDIR" ] ; then
  echo "Directory $WORKDIR already exists!" >&2
  exit 1
fi
# Abort if the directory cannot be created or entered.  WORKDIR is
# re-derived from pwd below, and the cleanup at the end of the script
# deletes the files in WORKDIR, so continuing after a failed cd would
# point all of that at the directory the script was started from.
mkdir -p -- "$WORKDIR" || exit 1
cd -- "$WORKDIR" || exit 1
WORKDIR=$(pwd)

# Copy the top-level entries of DATADIR into the (now current) work
# directory.  cp is not given -r, so sub-directories are not copied; its
# exit status is intentionally not treated as fatal.
cp "$DATADIR"/* .

# cv specific stuff starts here
# gen_samples is run with the work directory as the current directory.
# NOTE(review): presumably it writes the train_N / test_N files used
# below into the current directory -- confirm against cv/gen_samples.
"$BASEDIR/cv/gen_samples" "$NUMFOLDS"

# train and test on folds.  Classify the unlabeled data
# NOTE(review): train and predict are invoked by bare name after changing
# into the algorithm directory, so they must be resolvable through PATH.
# Stop if the directory cannot be entered rather than running them from
# the wrong place.
cd -- "$BASEDIR/algs/$ALG" || exit 1
for i in $(seq 1 "$NUMFOLDS"); do
  train "$WORKDIR/train_$i"
  predict "$WORKDIR/test_$i" "$WORKDIR/train_${i}" "$WORKDIR/test_${i}_preds"
  predict "$WORKDIR/unlabeled" "$WORKDIR/train_${i}" "$WORKDIR/unlabeled_${i}_preds"
done
# train on the whole data.  Classify the unlabeled data
train "$WORKDIR/labeled"
predict "$WORKDIR/unlabeled" "$WORKDIR/labeled" "$WORKDIR/unlabeled_preds"
# The remaining steps refer to algs/$ALG/... relative to BASEDIR.
cd -- "$BASEDIR" || exit 1

# Replace each *_preds file in the work directory with the output of the
# algorithm's `labels` tool run on that file (written to a temporary file
# first, then moved over the original).
for i in "$WORKDIR"/*_preds ; do
  # When nothing matches, the glob is left as the literal pattern; skip it
  # instead of creating a file literally named "*_preds".
  [ -e "$i" ] || continue
  "algs/$ALG/labels" "$i" > "$WORKDIR/tmp"
  mv "$WORKDIR/tmp" "$i"
done

# compute the cv bound for the randomized ensemble
# Each fold is given the confidence parameter DELTA / (2 * NUMFOLDS).  The
# value does not depend on the fold, so it is computed once up front.
DELTASHARE=$(perl -e "print $DELTA / (2*$NUMFOLDS)")
for i in $(seq 1 "$NUMFOLDS"); do
  # Labels of the fold's test set, compared against the fold's predictions.
  "algs/$ALG/labels" "$WORKDIR/test_${i}" > "$WORKDIR/tmp"
  errors=$(errors "$WORKDIR/tmp" "$WORKDIR/test_${i}_preds")
  cases=$(lines "$WORKDIR/tmp")
  error_rate=$(perl -e "print $errors / $cases")
  # Numeric tool outputs are left unquoted on purpose so that any stray
  # surrounding whitespace in them is dropped, as before.
  bound=$(upper_bound $DELTASHARE $cases $errors)
  # One line per fold is appended to each of the three summary files.
  echo $errors >> "$WORKDIR/errors"
  echo $error_rate >> "$WORKDIR/error_rates"
  echo $bound >> "$WORKDIR/bounds"
  rm "$WORKDIR/tmp"
done
# Summaries over all folds, produced by the `avg` tool.
cv_estimate=$(avg "$WORKDIR/error_rates")
bound_for_rand=$(avg "$WORKDIR/bounds")

# compute the distance between the randomized and the final hypothesis
# The glob picks up the per-fold unlabeled_N_preds files; unlabeled_preds
# itself does not match because the pattern needs a second underscore.
ensemble "$WORKDIR"/unlabeled_*_preds > "$WORKDIR/unlabeled_ensemble"
random_predict "$WORKDIR/unlabeled_ensemble" > "$WORKDIR/unlabeled_ensemble_preds"
# Disagreements between the final classifier's predictions and the
# randomized ensemble's predictions on the unlabeled data.
errors=$(errors "$WORKDIR/unlabeled_preds" "$WORKDIR/unlabeled_ensemble_preds")
cases=$(lines "$WORKDIR/unlabeled_preds")
# This bound is computed with confidence parameter DELTA / 2.
DELTASHARE=$(perl -e "print $DELTA / 2")
# Numeric tool outputs are left unquoted on purpose so that any stray
# surrounding whitespace in them is dropped, as before.
bound_dist=$(upper_bound $DELTASHARE $cases $errors)

# cheat and compute the error rate of the randomized ensemble on unlabeled data
# The labels of the unlabeled set and their count are needed for both
# error rates below, so they are extracted once and reused.
# NOTE(review): this assumes `labels` gives the same output each time it
# is run on the same file -- confirm.
"algs/$ALG/labels" "$WORKDIR/unlabeled" > "$WORKDIR/tmp"
cases=$(lines "$WORKDIR/tmp")
errors=$(errors "$WORKDIR/tmp" "$WORKDIR/unlabeled_ensemble_preds")
error_rate_rand=$(perl -e "print $errors / $cases")

# cheat again and compute the error rate of the final classifier on unlabeled data
errors=$(errors "$WORKDIR/tmp" "$WORKDIR/unlabeled_preds")
error_rate_final=$(perl -e "print $errors / $cases")

# output the bound
# The results file holds a single line: the sum of the two bounds (written
# by perl without a newline) followed by the individual quantities.
perl -e "print $bound_for_rand + $bound_dist" > "$WORKDIR/results"
# The values are left unquoted so the output format is unchanged; the
# leading empty argument makes echo emit the separating space after the sum.
echo "" $bound_for_rand $bound_dist $error_rate_rand $error_rate_final $cv_estimate >> "$WORKDIR/results"
cat "$WORKDIR/results"
# Delete everything in the work directory except the results file.
# GLOBIGNORE removes the results path from the glob's matches; note that
# a non-empty GLOBIGNORE also makes "*" match dot-files.
GLOBIGNORE="$WORKDIR/results"
# ${WORKDIR:?} aborts instead of expanding to "/*" if WORKDIR is empty.
rm "${WORKDIR:?}"/*
