Source code for pruneabletree.csv_importer

import warnings

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class CsvImporter(BaseEstimator, TransformerMixin):
    """Transform a CSV document to a numpy matrix of data such that the data
    is ready for use by decision tree classifiers.

    This implies that instances with missing values are removed and that
    one-hot encoding is applied to all non-numeric columns. The class column
    is processed with a label encoder.

    Parameters
    ----------
    encoding : string, 'utf-8' by default.
        The encoding used to decode the input file.

    sep : str, default ','
        Delimiter to use.

    dtype : Type name or dict of column -> type, default None
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}.
        Use str or object together with suitable na_values settings to
        preserve and not interpret dtype.

    na_values : scalar, str, list-like, or dict, default None
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. The following values are always interpreted as
        NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
        '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
        'nan', 'null'.

    class_index : int, default -1 (i.e., the last column)
        Column index of the class attribute. This column will not be present
        in the transform output, but will be kept separately in the y
        attribute of this transformer. Multi output scenarios are not
        supported.

    missing_threshold : float (fraction between 0 and 1), default 0.75
        Indicates the least fraction of instances that must remain after
        removing instances with missing values without raising a warning.
        If less remain, a warning will be raised.

    Attributes
    ----------
    y : numpy array, [n_samples]
        Data extracted from the CSV based on the given `class_index` and then
        encoded. This data is not returned by transform, but saved here
        instead.

    original_y : numpy array, [n_samples]
        Same as `y`, but before encoding.
    """

    def __init__(self, encoding='utf-8', sep=',', dtype=None, na_values=None,
                 class_index=-1, missing_threshold=0.75):
        self.encoding = encoding
        self.sep = sep
        self.dtype = dtype
        self.na_values = na_values
        self.class_index = class_index
        self.missing_threshold = missing_threshold
        # Populated by transform(); None until a CSV file has been processed.
        self.y = None
        self.original_y = None

    def fit(self, csv_file, y=None):
        """Extract data from the given CSV file.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        y : ignored
            Not used by this method.

        Returns
        -------
        self
        """
        # All the work happens in transform(); the matrix is discarded here
        # and only the `y` / `original_y` attributes are kept.
        self.fit_transform(csv_file)
        return self

    def fit_transform(self, csv_file, y=None):
        """Extract data from the given CSV file and return it as a numpy
        matrix.

        This simply delegates to transform, so the file is read only once.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        y : ignored
            Not used by this method.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.
        """
        return self.transform(csv_file)

    def transform(self, csv_file):
        """Extract data from the given CSV file and return it as a numpy
        matrix.

        Rows containing missing values are dropped, the class column
        (selected by `class_index`) is label-encoded and stored in `y`
        (its raw values in `original_y`), and the remaining non-numeric
        columns are one-hot encoded.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.

        Warns
        -----
        UserWarning
            If the fraction of rows remaining after dropping rows with
            missing values is below `missing_threshold`.
        """
        df = pd.read_csv(csv_file, encoding=self.encoding, sep=self.sep,
                         dtype=self.dtype, na_values=self.na_values)

        n_before = len(df)
        # TODO alternative: Imputation
        # http://scikit-learn.org/dev/modules/impute.html (assumes Gaussian)
        df = df.dropna()
        n_after = len(df)

        # A CSV with a header but no data rows has n_before == 0; nothing was
        # lost in that case, so avoid dividing by zero.
        fraction_remaining = n_after / n_before if n_before else 1.0
        if fraction_remaining < self.missing_threshold:
            percent_lost = 100 * (1 - fraction_remaining)
            warnings.warn("Lost {:.0f}% of instances by removing missing values: {} -> {}".format(percent_lost, n_before, n_after))

        # Split off the class column (position given by class_index) before
        # encoding the features.
        self.original_y = df.iloc[:, self.class_index].values
        self.y = LabelEncoder().fit_transform(self.original_y)
        class_column_name = df.columns[self.class_index]
        features = df.drop(class_column_name, axis=1)

        # Apply one hot encoding to non-numeric columns (like
        # CategoricalEncoder).
        return pd.get_dummies(features).values

    # TODO: decide whether an inverse_transform is needed; converting the
    # data back to a CSV file is not supported.

    def fit_transform_both(self, csv_file):
        """Extract data from the given CSV file and return it as a numpy
        matrix. Also returns the encoded class values at the same time.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.

        y : numpy array, [n_samples]
            Data extracted from the CSV based on the given `class_index` and
            then encoded.
        """
        X = self.fit_transform(csv_file)
        return X, self.y