<?php
    declare(strict_types=1);

    include('griddb_php_client.php');
    include 'vendor/autoload.php';

    /// Name of the container that every query in this script is run against.
    $containerName = 'income_ml';

    /// Store factory; used below to open the GridStore connection.
    $factory = StoreFactory::get_default();

    use Phpml\Dataset\CsvDataset;
    use Phpml\Dataset\ArrayDataset;
    use Phpml\FeatureExtraction\TokenCountVectorizer;
    use Phpml\Tokenization\WordTokenizer;
    use Phpml\CrossValidation\StratifiedRandomSplit;
    use Phpml\FeatureExtraction\TfIdfTransformer;
    use Phpml\Metric\Accuracy;
    use Phpml\Classification\SVC;
    use Phpml\Classification\NaiveBayes;
    use Phpml\SupportVectorMachine\Kernel;

    /// Lift the PHP memory limit ('-1' means unlimited) for the duration of this script.
    ini_set('memory_limit', '-1');

    /// Flag handed to every $query->fetch() call below.
    /// NOTE(review): presumably false requests a plain read without update locking; confirm against the client API.
    $update = false;

    try {
        /// All five connection parameters must be supplied on the command line.
        if (!isset($argv[5])) {
            echo("Usage: php " . ($argv[0] ?? 'script.php') . " <notificationAddress> <notificationPort> <clusterName> <user> <password>\n");
            exit(1);
        }

        /// Get GridStore object
        $gridstore = $factory->get_store(array(
            "notificationAddress" => $argv[1],
            "notificationPort" => $argv[2],
            "clusterName" => $argv[3],
            "user" => $argv[4],
            "password" => $argv[5],
        ));
        /// Look the container up once before doing any work; the result is discarded.
        /// NOTE(review): presumably this forces the actual cluster connection so
        /// that connection errors surface here; confirm against the client API.
        $gridstore->get_container($containerName);
        echo("Connect to Cluster\n");

        /// Shuffles samples and their targets with one shared random permutation
        /// and returns both as dense, 0-indexed lists: array($samples, $targets).
        /// Dense keys matter because the predicted labels are compared to the
        /// targets index by index further down.
        $shuffle_pairs = static function (array $samples, array $targets): array {
            $order = array_keys($samples);
            shuffle($order);
            $mixed_samples = array();
            $mixed_targets = array();
            foreach ($order as $key) {
                $mixed_samples[] = $samples[$key];
                $mixed_targets[] = $targets[$key];
            }
            return array($mixed_samples, $mixed_targets);
        };

        $cur_run = 0;
        $prediction_runs = 1;
        /// Accuracy on the held-out test split / on the validation query, one entry per run.
        $accuracy_test_data = array();
        $accuracy_validation_data = array();

        while ($cur_run < $prediction_runs) {

            /// Get the container; abort instead of calling methods on a missing container.
            $col = $gridstore->get_container($containerName);
            if (!is_object($col)) {
                echo("ERROR Container not found. name=$containerName\n");
                exit(1);
            }
            echo("Collection connected.\n");

            /// Query sizing. These are placeholders: set them before running.
            ///   $query_limit            - max rows per train/test query
            ///   $validation_query_limit - max rows for the validation query
            ///   $id_limit               - id threshold used by the validation query
            ///   $offset_step            - OFFSET applied to the validation query
            $query_limit = 0;
            $validation_query_limit = 0;
            $id_limit = 0;
            $offset_step = 0;
            echo sprintf("Querrying for %d queries, with %d validation samples and an id limit at: %d ... \n",  $query_limit, $validation_query_limit, $id_limit );

            $train_samples = array();
            $train_targets = array();
            $validation_samples = array();
            $validation_targets = array();

            ///////////////////////////////////////////////////////////////////////
            /// Exclusion of incomplete data entries (in case they have not been deleted.)
            $nm = "( NOT age=0 AND NOT workclass='?' AND NOT education='?' AND NOT familiy='?' AND NOT occupation='?'AND NOT relationship='?' AND NOT race='?' AND NOT gender='?' AND NOT nation='?' AND NOT income_status='?' )";

            ///////////////////////////////////////////////////////////////////////
            /// Query list: every entry except the last feeds the train/test set,
            /// the last entry feeds the validation set.
            $query_list = array(
                /// General (train and test):
                "SELECT * WHERE (gender='Male' AND " . $nm . ") LIMIT " . $query_limit,
                "SELECT * WHERE (gender='Female' AND " . $nm . ") LIMIT " . $query_limit,
                /// General (validation); the id threshold is interpolated from $id_limit.
                "SELECT * WHERE id>$id_limit LIMIT " . $validation_query_limit . " OFFSET " . $offset_step,
            );

            ///////////////////////////////////////////////////////////////////////
            /// Specific subgroups (validation):
            /// Uncomment one line to override the general validation query.
            /// Replace the /*tag*/ with a corresponding string to filter for said tag.

            /// By education:
            //$query_list[2] = "SELECT * WHERE (education='Bachelors' AND id>$id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;
            //$query_list[2] = "SELECT * WHERE (education='Some-college' AND id>$id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;
            //$query_list[2] = "SELECT * WHERE (education='/*EDUCATION*/' AND id>$id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;

            ///////////////////////////////////////////////////////////////////////
            /// Groupings of subgroups (validation):

            /// By family (married):
            //$query_list[2] = "SELECT * WHERE ((familiy='Married-civ-spouse' OR familiy='Married-AF-spouse') AND id>=$id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;

            /// By family (single):
            //$query_list[2] = "SELECT * WHERE ((familiy='Divorced' OR familiy='Never-married' OR familiy='Separated' OR familiy='Widowed') AND id<$id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;

            /// By occupation (physical laborers):
            //$query_list[2] = "SELECT * WHERE ( (occupation='Craft-repair' OR occupation='Handlers-cleaners' OR occupation='Farming-fishing' OR occupation='Priv-house-serv') AND id>=$id_limit AND " . $nm .  ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;

            /// By outlier:
            //$query_list[2] = "SELECT * WHERE ( ( NOT race='White' AND NOT nation='United-States' AND NOT nation='Outlying-US(Guam-USVI-etc)') AND id>= $id_limit AND " . $nm . ") LIMIT " . $validation_query_limit . " OFFSET " . $offset_step;

            /// Index of the validation query; all lower indices are train/test queries.
            $validation_index = count($query_list) - 1;

            foreach ($query_list as $cur_query => $tql) {
                print_r($tql); echo PHP_EOL;
                $query = $col->query($tql);
                echo("Fetching... \n");
                $read_search = $query->fetch($update);

                $read_row = $col->create_row();
                while ($read_search->has_next()) {
                    $read_search->get_next($read_row);

                    /// Each row becomes one space-separated "document" for the vectorizer.
                    /// NOTE(review): the column indices are assumed to map to age,
                    /// workclass, education, family, occupation, relationship, race,
                    /// gender, nation and income_status; verify against the container schema.
                    $features = implode(" ", array(
                        $read_row->get_field_as_integer(1),
                        $read_row->get_field_as_string(2),
                        $read_row->get_field_as_string(4),
                        $read_row->get_field_as_string(6),
                        $read_row->get_field_as_string(7),
                        $read_row->get_field_as_string(8),
                        $read_row->get_field_as_string(9),
                        $read_row->get_field_as_string(10),
                        $read_row->get_field_as_string(14),
                    ));
                    $inc = $read_row->get_field_as_string(15);

                    if ($cur_query < $validation_index) {
                        $train_samples[] = $features;
                        $train_targets[] = $inc;
                    } else {
                        $validation_samples[] = $features;
                        $validation_targets[] = $inc;
                    }
                }
            }

            /// Randomize the row order. Appending and then shuffling keeps the
            /// arrays dense even when a query returns fewer rows than its LIMIT.
            list($train_samples, $train_targets) = $shuffle_pairs($train_samples, $train_targets);
            list($validation_samples, $validation_targets) = $shuffle_pairs($validation_samples, $validation_targets);

            echo "Train samples: " . count($train_samples) . " targets: ". count($train_targets) . ", Validation samples: " . count($validation_samples) . " targets: " . count($validation_targets) . PHP_EOL;

            /// Training and prediction cannot work on empty sets; stop with a clear message.
            if (count($train_samples) === 0 || count($validation_samples) === 0) {
                echo("ERROR No samples fetched for training or validation. Check the query limits.\n");
                exit(1);
            }

            /// Convert samples into ml dataset
            $data = new ArrayDataset($train_samples, $train_targets);
            $samples = $data->getSamples();

            /// Vectorizing and transforming (both transform() calls modify $samples in place)
            echo("Vectorizing... \n");
            $vectorizer = new TokenCountVectorizer(new WordTokenizer());
            $tfIdfTransformer = new TfIdfTransformer();
            $vectorizer->fit($samples);
            $vectorizer->transform($samples);
            echo("Transforming... \n");
            $tfIdfTransformer->fit($samples);
            $tfIdfTransformer->transform($samples);
            echo("Setting up dataset... \n");
            $dataset = new ArrayDataset($samples, $data->getTargets());

            /// Use random split to split the samples into a train and test set
            $randomSplit = new StratifiedRandomSplit($dataset, 0.2, rand(1, 5000));

            /// Setup ml algorithm
            echo("Setting up SVC... \n");
            $classifier = new SVC(
                Kernel::RBF,    // $kernel
                1.0,            // $cost
                3,              // $degree
                0.01,           // $gamma
                0.0,            // $coef0
                0.001,          // $tolerance
                100,            // $cacheSize
                true,           // $shrinking
                false           // $probabilityEstimates
            );

            /// Project the validation rows with the vectorizer/transformer fitted
            /// on the training data.
            $vectorizer->transform($validation_samples);
            $tfIdfTransformer->transform($validation_samples);

            /// Training and predicting:
            echo("Training...\n");
            $classifier->train($randomSplit->getTrainSamples(), $randomSplit->getTrainLabels());

            echo("Predicting... \n");
            $predicted_labels = $classifier->predict($randomSplit->getTestSamples());
            $test_accuracy = Accuracy::score($randomSplit->getTestLabels(), $predicted_labels);
            echo 'Accuracy: '.$test_accuracy . PHP_EOL;
            $accuracy_test_data[$cur_run] = $test_accuracy;

            $validation_predicted_labels = $classifier->predict($validation_samples);
            $validation_accuracy = Accuracy::score($validation_targets, $validation_predicted_labels);
            echo 'Accuracy: '.$validation_accuracy . PHP_EOL;
            $accuracy_validation_data[$cur_run] = $validation_accuracy;
            $cur_run++;
        }

        print_r($accuracy_test_data);
        print_r($accuracy_validation_data);

        echo("Operation completed! \n");

    } catch(GSException $e){
        echo($e->what()."\n");
        echo($e->get_code()."\n");
    }
?>
