# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Joel Nothman <joel.nothman@gmail.com>
# Hamzeh Alsalhi <ha258@cornell.edu>
# License: BSD 3 clause
import array
import itertools
import warnings
from collections import defaultdict
from numbers import Integral
import numpy as np
import scipy.sparse as sp
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import column_or_1d
from ..utils._encode import _encode, _unique
from ..utils._param_validation import Interval, validate_params
from ..utils.multiclass import type_of_target, unique_labels
from ..utils.sparsefuncs import min_max_axis
from ..utils.validation import _num_samples, check_array, check_is_fitted
__all__ = [
"label_binarize",
"LabelBinarizer",
"LabelEncoder",
"MultiLabelBinarizer",
]
[docs]
class LabelEncoder(TransformerMixin, BaseEstimator):
"""Encode target labels with value between 0 and n_classes-1.
This transformer should be used to encode target values, *i.e.* `y`, and
not the input `X`.
Read more in the :ref:`User Guide <preprocessing_targets>`.
.. versionadded:: 0.12
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Holds the label for each class.
See Also
--------
OrdinalEncoder : Encode categorical features using an ordinal encoding
scheme.
OneHotEncoder : Encode categorical features as a one-hot numeric array.
Examples
--------
`LabelEncoder` can be used to normalize labels.
>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])
It can also be used to transform non-numerical labels (as long as they are
hashable and comparable) to numerical labels.
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
"""
[docs]
def fit(self, y):
"""Fit label encoder.
Parameters
----------
y : array-like of shape (n_samples,)
Target values.
Returns
-------
self : returns an instance of self.
Fitted label encoder.
"""
y = column_or_1d(y, warn=True)
self.classes_ = _unique(y)
return self
[docs]
class LabelBinarizer(TransformerMixin, BaseEstimator):
"""Binarize labels in a one-vs-all fashion.
Several regression and binary classification algorithms are
available in scikit-learn. A simple way to extend these algorithms
to the multi-class classification case is to use the so-called
one-vs-all scheme.
At learning time, this simply consists in learning one regressor
or binary classifier per class. In doing so, one needs to convert
multi-class labels to binary labels (belong or does not belong
to the class). `LabelBinarizer` makes this process easy with the
transform method.
At prediction time, one assigns the class for which the corresponding
model gave the greatest confidence. `LabelBinarizer` makes this easy
with the :meth:`inverse_transform` method.
Read more in the :ref:`User Guide <preprocessing_targets>`.
Parameters
----------
neg_label : int, default=0
Value with which negative labels must be encoded.
pos_label : int, default=1
Value with which positive labels must be encoded.
sparse_output : bool, default=False
True if the returned array from transform is desired to be in sparse
CSR format.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
Holds the label for each class.
y_type_ : str
Represents the type of the target data as evaluated by
:func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
'continuous', 'continuous-multioutput', 'binary', 'multiclass',
'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.
sparse_input_ : bool
`True` if the input data to transform is given as a sparse matrix,
`False` otherwise.
See Also
--------
label_binarize : Function to perform the transform operation of
LabelBinarizer with fixed classes.
OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
scheme.
Examples
--------
>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit([1, 2, 6, 4, 2])
LabelBinarizer()
>>> lb.classes_
array([1, 2, 4, 6])
>>> lb.transform([1, 6])
array([[1, 0, 0, 0],
[0, 0, 0, 1]])
Binary targets transform to a column vector
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
array([[1],
[0],
[0],
[1]])
Passing a 2D matrix for multilabel classification
>>> import numpy as np
>>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
LabelBinarizer()
>>> lb.classes_
array([0, 1, 2])
>>> lb.transform([0, 1, 2, 1])
array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 1, 0]])
"""
_parameter_constraints: dict = {
"neg_label": [Integral],
"pos_label": [Integral],
"sparse_output": ["boolean"],
}
[docs]
def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
self.neg_label = neg_label
self.pos_label = pos_label
self.sparse_output = sparse_output
[docs]
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, y):
"""Fit label binarizer.
Parameters
----------
y : ndarray of shape (n_samples,) or (n_samples, n_classes)
Target values. The 2-d matrix should only contain 0 and 1,
represents multilabel classification.
Returns
-------
self : object
Returns the instance itself.
"""
if self.neg_label >= self.pos_label:
raise ValueError(
f"neg_label={self.neg_label} must be strictly less than "
f"pos_label={self.pos_label}."
)
if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
raise ValueError(
"Sparse binarization is only supported with non "
"zero pos_label and zero neg_label, got "
f"pos_label={self.pos_label} and neg_label={self.neg_label}"
)
self.y_type_ = type_of_target(y, input_name="y")
if "multioutput" in self.y_type_:
raise ValueError(
"Multioutput target data is not supported with label binarization"
)
if _num_samples(y) == 0:
raise ValueError("y has 0 samples: %r" % y)
self.sparse_input_ = sp.issparse(y)
self.classes_ = unique_labels(y)
return self
@validate_params(
{
"y": ["array-like"],
"classes": ["array-like"],
"neg_label": [Interval(Integral, None, None, closed="neither")],
"pos_label": [Interval(Integral, None, None, closed="neither")],
"sparse_output": ["boolean"],
},
prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
"""Binarize labels in a one-vs-all fashion.
Several regression and binary classification algorithms are
available in scikit-learn. A simple way to extend these algorithms
to the multi-class classification case is to use the so-called
one-vs-all scheme.
This function makes it possible to compute this transformation for a
fixed set of class labels known ahead of time.
Parameters
----------
y : array-like
Sequence of integer labels or multilabel data to encode.
classes : array-like of shape (n_classes,)
Uniquely holds the label for each class.
neg_label : int, default=0
Value with which negative labels must be encoded.
pos_label : int, default=1
Value with which positive labels must be encoded.
sparse_output : bool, default=False,
Set to true if output binary array is desired in CSR sparse format.
Returns
-------
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
Shape will be (n_samples, 1) for binary problems. Sparse matrix will
be of CSR format.
See Also
--------
LabelBinarizer : Class used to wrap the functionality of label_binarize and
allow for fitting to classes independently of the transform operation.
Examples
--------
>>> from sklearn.preprocessing import label_binarize
>>> label_binarize([1, 6], classes=[1, 2, 4, 6])
array([[1, 0, 0, 0],
[0, 0, 0, 1]])
The class ordering is preserved:
>>> label_binarize([1, 6], classes=[1, 6, 4, 2])
array([[1, 0, 0, 0],
[0, 1, 0, 0]])
Binary targets transform to a column vector
>>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
array([[1],
[0],
[0],
[1]])
"""
if not isinstance(y, list):
# XXX Workaround that will be removed when list of list format is
# dropped
y = check_array(
y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
)
else:
if _num_samples(y) == 0:
raise ValueError("y has 0 samples: %r" % y)
if neg_label >= pos_label:
raise ValueError(
"neg_label={0} must be strictly less than pos_label={1}.".format(
neg_label, pos_label
)
)
if sparse_output and (pos_label == 0 or neg_label != 0):
raise ValueError(
"Sparse binarization is only supported with non "
"zero pos_label and zero neg_label, got "
"pos_label={0} and neg_label={1}"
"".format(pos_label, neg_label)
)
# To account for pos_label == 0 in the dense case
pos_switch = pos_label == 0
if pos_switch:
pos_label = -neg_label
y_type = type_of_target(y)
if "multioutput" in y_type:
raise ValueError(
"Multioutput target data is not supported with label binarization"
)
if y_type == "unknown":
raise ValueError("The type of target data is not known")
n_samples = y.shape[0] if sp.issparse(y) else len(y)
n_classes = len(classes)
classes = np.asarray(classes)
if y_type == "binary":
if n_classes == 1:
if sparse_output:
return sp.csr_matrix((n_samples, 1), dtype=int)
else:
Y = np.zeros((len(y), 1), dtype=int)
Y += neg_label
return Y
elif len(classes) >= 3:
y_type = "multiclass"
sorted_class = np.sort(classes)
if y_type == "multilabel-indicator":
y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
if classes.size != y_n_classes:
raise ValueError(
"classes {0} mismatch with the labels {1} found in the data".format(
classes, unique_labels(y)
)
)
if y_type in ("binary", "multiclass"):
y = column_or_1d(y)
# pick out the known labels from y
y_in_classes = np.in1d(y, classes)
y_seen = y[y_in_classes]
indices = np.searchsorted(sorted_class, y_seen)
indptr = np.hstack((0, np.cumsum(y_in_classes)))
data = np.empty_like(indices)
data.fill(pos_label)
Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
elif y_type == "multilabel-indicator":
Y = sp.csr_matrix(y)
if pos_label != 1:
data = np.empty_like(Y.data)
data.fill(pos_label)
Y.data = data
else:
raise ValueError(
"%s target data is not supported with label binarization" % y_type
)
if not sparse_output:
Y = Y.toarray()
Y = Y.astype(int, copy=False)
if neg_label != 0:
Y[Y == 0] = neg_label
if pos_switch:
Y[Y == pos_label] = 0
else:
Y.data = Y.data.astype(int, copy=False)
# preserve label ordering
if np.any(classes != sorted_class):
indices = np.searchsorted(sorted_class, classes)
Y = Y[:, indices]
if y_type == "binary":
if sparse_output:
Y = Y.getcol(-1)
else:
Y = Y[:, -1].reshape((-1, 1))
return Y
def _inverse_binarize_multiclass(y, classes):
"""Inverse label binarization transformation for multiclass.
Multiclass uses the maximal score instead of a threshold.
"""
classes = np.asarray(classes)
if sp.issparse(y):
# Find the argmax for each row in y where y is a CSR matrix
y = y.tocsr()
n_samples, n_outputs = y.shape
outputs = np.arange(n_outputs)
row_max = min_max_axis(y, 1)[1]
row_nnz = np.diff(y.indptr)
y_data_repeated_max = np.repeat(row_max, row_nnz)
# picks out all indices obtaining the maximum per row
y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)
# For corner case where last row has a max of 0
if row_max[-1] == 0:
y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])
# Gets the index of the first argmax in each row from y_i_all_argmax
index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
# first argmax of each row
y_ind_ext = np.append(y.indices, [0])
y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
# Handle rows of all 0
y_i_argmax[np.where(row_nnz == 0)[0]] = 0
# Handles rows with max of 0 that contain negative numbers
samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
for i in samples:
ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]
return classes[y_i_argmax]
else:
return classes.take(y.argmax(axis=1), mode="clip")
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
"""Inverse label binarization transformation using thresholding."""
if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))
if output_type != "binary" and y.shape[1] != len(classes):
raise ValueError(
"The number of class is not equal to the number of dimension of y."
)
classes = np.asarray(classes)
# Perform thresholding
if sp.issparse(y):
if threshold > 0:
if y.format not in ("csr", "csc"):
y = y.tocsr()
y.data = np.array(y.data > threshold, dtype=int)
y.eliminate_zeros()
else:
y = np.array(y.toarray() > threshold, dtype=int)
else:
y = np.array(y > threshold, dtype=int)
# Inverse transform data
if output_type == "binary":
if sp.issparse(y):
y = y.toarray()
if y.ndim == 2 and y.shape[1] == 2:
return classes[y[:, 1]]
else:
if len(classes) == 1:
return np.repeat(classes[0], len(y))
else:
return classes[y.ravel()]
elif output_type == "multilabel-indicator":
return y
else:
raise ValueError("{0} format is not supported".format(output_type))
[docs]
class MultiLabelBinarizer(TransformerMixin, BaseEstimator):
"""Transform between iterable of iterables and a multilabel format.
Although a list of sets or tuples is a very intuitive format for multilabel
data, it is unwieldy to process. This transformer converts between this
intuitive format and the supported multilabel format: a (samples x classes)
binary matrix indicating the presence of a class label.
Parameters
----------
classes : array-like of shape (n_classes,), default=None
Indicates an ordering for the class labels.
All entries should be unique (cannot contain duplicate classes).
sparse_output : bool, default=False
Set to True if output binary array is desired in CSR sparse format.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
A copy of the `classes` parameter when provided.
Otherwise it corresponds to the sorted set of classes found
when fitting.
See Also
--------
OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
scheme.
Examples
--------
>>> from sklearn.preprocessing import MultiLabelBinarizer
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
[0, 0, 1]])
>>> mlb.classes_
array([1, 2, 3])
>>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
array([[0, 1, 1],
[1, 0, 0]])
>>> list(mlb.classes_)
['comedy', 'sci-fi', 'thriller']
A common mistake is to pass in a list, which leads to the following issue:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
MultiLabelBinarizer()
>>> mlb.classes_
array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
'y'], dtype=object)
To correct this, the list of labels should be passed in as:
>>> mlb = MultiLabelBinarizer()
>>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
MultiLabelBinarizer()
>>> mlb.classes_
array(['comedy', 'sci-fi', 'thriller'], dtype=object)
"""
_parameter_constraints: dict = {
"classes": ["array-like", None],
"sparse_output": ["boolean"],
}
[docs]
def __init__(self, *, classes=None, sparse_output=False):
self.classes = classes
self.sparse_output = sparse_output
[docs]
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, y):
"""Fit the label sets binarizer, storing :term:`classes_`.
Parameters
----------
y : iterable of iterables
A set of labels (any orderable and hashable object) for each
sample. If the `classes` parameter is set, `y` will not be
iterated.
Returns
-------
self : object
Fitted estimator.
"""
self._cached_dict = None
if self.classes is None:
classes = sorted(set(itertools.chain.from_iterable(y)))
elif len(set(self.classes)) < len(self.classes):
raise ValueError(
"The classes argument contains duplicate "
"classes. Remove these duplicates before passing "
"them to MultiLabelBinarizer."
)
else:
classes = self.classes
dtype = int if all(isinstance(c, int) for c in classes) else object
self.classes_ = np.empty(len(classes), dtype=dtype)
self.classes_[:] = classes
return self
[docs]
def _build_cache(self):
if self._cached_dict is None:
self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))
return self._cached_dict