import numpy as np
import os
import sys


# arguments:
#    - pathname: source file of the data
#    - labels_to_ints: a dictionary that maps original class labels (which 
#           can be ints or strings) to consecutive ints starting at 0
#           Note that the function MODIFIES this argument to store new
#           mappings that it creates while reading the file.
#    - ints_to_labels: a dictionary that maps int labels to original class 
#           labels (which can be ints or strings).
#           Note that the function MODIFIES this argument to store new
#           mappings that it creates while reading the file.
# returns:
#    - data: a 2D numpy array, where each row is an input.
#    - labels: a numpy column vector. That means it is a 2D numpy array, 
#           with a single column. labels[i,0] is the class label
#           for the object stored at data[i].
def read_uci_file(pathname, labels_to_ints, ints_to_labels):
    """Read one whitespace-separated UCI data file into numpy arrays.

    Every non-blank line must contain the same number of whitespace-separated
    items as the first non-blank line. All items but the last are parsed with
    float() as attribute values; the last item is the class label, which is
    kept as a string and mapped to an int. Whitespace-only lines are skipped.

    Arguments:
        pathname: source file of the data.
        labels_to_ints: dictionary that maps class labels (strings, as read
            from the file) to ints. MODIFIED in place: every label that is
            not already a key is assigned the value len(labels_to_ints).
        ints_to_labels: dictionary holding the inverse mapping (int label to
            original label). MODIFIED in place together with labels_to_ints.

    Returns:
        (data, labels) on success, where data is a 2D float array with one
        row per data line and labels is an int column vector (2D array with
        a single column); labels[i, 0] is the int label of data[i].
        None, after printing a message, if the file does not exist, has no
        data lines, or has a line with an unexpected number of columns. In
        the last case the two dictionaries may already contain mappings
        added by the lines that preceded the bad one.

    Raises:
        ValueError: if an attribute value cannot be parsed by float().
    """
    if not os.path.isfile(pathname):
        print("read_data: %s not found" % (pathname,))
        return None

    # The context manager closes the file even if reading raises.
    with open(pathname) as in_file:
        file_lines = in_file.readlines()

    # Tokenize every line once, remembering its position in the file so that
    # error messages still refer to the real line, and drop blank lines
    # (split() on a whitespace-only line yields an empty list).
    tokenized = [(index, line.split()) for index, line in enumerate(file_lines)]
    tokenized = [(index, items) for (index, items) in tokenized if items]

    rows = len(tokenized)
    if rows == 0:
        print("read_data: zero rows in %s" % (pathname,))
        return None

    # The first data line fixes the expected column count; the last column
    # is the label, so the attribute matrix has one column fewer.
    cols = len(tokenized[0][1])
    data = np.zeros((rows, cols - 1))
    labels = np.zeros((rows, 1), dtype=int)

    for row, (line_index, items) in enumerate(tokenized):
        if len(items) != cols:
            print("read_data: Line %d, %d columns expected, %d columns found" %(line_index, cols, len(items)))
            return None

        data[row, :] = [float(item) for item in items[:-1]]

        # The last column is a string representing the class label.
        label = items[-1]
        if label in labels_to_ints:
            ilabel = labels_to_ints[label]
        else:
            # First occurrence of this label: give it the next free int and
            # record the mapping in both directions.
            ilabel = len(labels_to_ints)
            labels_to_ints[label] = ilabel
            ints_to_labels[ilabel] = label

        labels[row, 0] = ilabel

    return (data, labels)


# arguments:
#    - directory: the pathname of the folder where the dataset is stored.
#    - dataset_name: the name of the dataset, such as "pendigits" or "yeast".
# returns a tuple of three items, where each item is itself a pair, so 
# overall the function returns six values.
#    ((train_data, train_labels), (test_data, test_labels), (ints_to_labels, labels_to_ints))
#    - train_data: a 2D numpy array, where each row is a training input object.
#    - train_labels: a numpy column vector. That means it is a 2D numpy array, 
#           with a single column. train_labels[i,0] is the class label for
#           the object stored at train_data[i].
#    - test_data: a 2D numpy array, where each row is a test input object.
#    - test_labels: a numpy column vector. That means it is a 2D numpy array, 
#           with a single column. test_labels[i,0] is the class label for
#           the object stored at test_data[i].
#    - labels_to_ints: a dictionary that maps original class labels (which 
#           can be ints or strings) to consecutive ints starting at 0
#    - ints_to_labels: a dictionary that maps int labels to original class 
#           labels (which can be ints or strings).
def read_uci_dataset(directory, dataset_name):
    """Load the training and test files of a UCI dataset.

    Reads "<dataset_name>_training.txt" and "<dataset_name>_test.txt" from
    directory using read_uci_file. Both reads share one pair of label
    dictionaries, so a class label is mapped to the same int in the training
    labels and in the test labels.

    Arguments:
        directory: the pathname of the folder where the dataset is stored.
        dataset_name: the name of the dataset, such as "pendigits" or "yeast".

    Returns:
        ((train_data, train_labels), (test_data, test_labels),
        (ints_to_labels, labels_to_ints)) on success.
        None if either file could not be read; read_uci_file has already
        printed the reason in that case.
    """
    # os.path.join avoids producing a root-relative path ("/name_training.txt")
    # when directory is an empty string.
    training_file = os.path.join(directory, dataset_name + "_training.txt")
    test_file = os.path.join(directory, dataset_name + "_test.txt")

    labels_to_ints = {}
    ints_to_labels = {}

    # read_uci_file signals failure by returning None; check before unpacking
    # so that a missing or malformed file does not surface as an opaque
    # "cannot unpack non-iterable NoneType" error.
    training_set = read_uci_file(training_file, labels_to_ints, ints_to_labels)
    if training_set is None:
        return None

    test_set = read_uci_file(test_file, labels_to_ints, ints_to_labels)
    if test_set is None:
        return None

    (train_data, train_labels) = training_set
    (test_data, test_labels) = test_set
    return ((train_data, train_labels), (test_data, test_labels), (ints_to_labels, labels_to_ints))