# Source code for pruneabletree.csv_importer
import warnings
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
class CsvImporter(BaseEstimator, TransformerMixin):
    """Transform a CSV document to a numpy matrix of data such that the data
    is ready for use by decision tree classifiers.

    Instances (rows) with missing values are removed and one-hot encoding is
    applied to all non-numeric feature columns. The class column is removed
    from the features and processed with a label encoder.

    Parameters
    ----------
    encoding : string, 'utf-8' by default.
        The encoding used to decode the input file.

    sep : str, default ','
        Delimiter to use.

    dtype : Type name or dict of column -> type, default None
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}.
        Use str or object together with suitable na_values settings to
        preserve and not interpret dtype.

    na_values : scalar, str, list-like, or dict, default None
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values. The following values are always interpreted
        as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
        '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
        'null'.

    class_index : int, default -1 (i.e., the last column)
        Column index of the class attribute. This column will not be present
        in the transform output, but will be kept separately in the y
        attribute of this transformer. Multi output scenarios are not
        supported.

    missing_threshold : float, default 0.75
        Fraction (between 0 and 1) of the instances that must remain after
        removing instances with missing values. If a smaller fraction
        remains, a warning is raised.

    Attributes
    ----------
    y : numpy array, [n_samples]
        Data extracted from the CSV based on the given `class_index` and then
        encoded. This data is not returned by transform, but saved here
        instead. ``None`` until a file has been processed.

    original_y : numpy array, [n_samples]
        Same as `y`, but before encoding. ``None`` until a file has been
        processed.
    """

    def __init__(self, encoding='utf-8', sep=',', dtype=None, na_values=None,
                 class_index=-1, missing_threshold=0.75):
        self.encoding = encoding
        self.sep = sep
        self.dtype = dtype
        self.na_values = na_values
        self.class_index = class_index
        self.missing_threshold = missing_threshold
        self.y = None
        self.original_y = None

    def fit(self, csv_file, y=None):
        """Extract data from the given CSV file.

        The extracted feature matrix is discarded; only the `y` and
        `original_y` attributes are updated.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        y : ignored
            Present for API compatibility only.

        Returns
        -------
        self
        """
        self.fit_transform(csv_file)
        return self

    def fit_transform(self, csv_file, y=None):
        """Extract data from the given CSV file and return it as a numpy matrix.

        This gives the same result as fit followed by transform, but reads
        the file only once instead of twice.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        y : ignored
            Present for API compatibility only.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.
        """
        return self.transform(csv_file)

    def transform(self, csv_file):
        """Extract data from the given CSV file and return it as a numpy matrix.

        As a side effect the `y` and `original_y` attributes are replaced by
        the class values found in this file.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.

        Warns
        -----
        UserWarning
            If the fraction of instances remaining after dropping those with
            missing values is below `missing_threshold`.
        """
        df = pd.read_csv(csv_file, encoding=self.encoding, sep=self.sep,
                         dtype=self.dtype, na_values=self.na_values)
        n_before = len(df)

        # TODO alternative: Imputation
        # http://scikit-learn.org/dev/modules/impute.html (assumes Gaussian)
        df = df.dropna()
        n_after = len(df)

        # A file without any data rows cannot lose instances; treating it as
        # "everything remained" avoids a ZeroDivisionError.
        if n_before:
            percent_remaining = float(n_after) / n_before
        else:
            percent_remaining = 1.0

        if percent_remaining < self.missing_threshold:
            percent_lost = 100 * (1 - percent_remaining)
            warnings.warn("Lost {:.0f}% of instances by removing missing values: {} -> {}".format(percent_lost, n_before, n_after))

        # Split off the class column (selected by position) and encode it.
        self.original_y = df.iloc[:, self.class_index].values
        self.y = LabelEncoder().fit_transform(self.original_y)
        class_column_name = df.columns[self.class_index]
        features = df.drop(class_column_name, axis=1)

        # Apply one hot encoding to non-numeric columns (like
        # CategoricalEncoder). The indicator dtype is pinned to uint8, the
        # historical pandas default: newer pandas versions default to bool,
        # and mixing bool with numeric columns makes `.values` an object
        # array instead of a numeric matrix.
        encoded = pd.get_dummies(features, dtype="uint8")
        return encoded.values

    # def inverse_transform(self, X): #TODO needed?
    #     raise NotImplementedError("Cannot convert back from data to CSV file.")

    def fit_transform_both(self, csv_file):
        """Extract data from the given CSV file and return it as a numpy matrix.

        Also returns the encoded class values at the same time.

        Parameters
        ----------
        csv_file : string
            File path to CSV file.

        Returns
        -------
        X : numpy matrix, [n_samples, n_features]
            Extracted data.

        y : numpy array, [n_samples]
            Data extracted from the CSV based on the given `class_index` and
            then encoded.
        """
        X = self.fit_transform(csv_file)
        return X, self.y