diff --git a/src/sagemaker_sklearn_extension/feature_extraction/embedtext.py b/src/sagemaker_sklearn_extension/feature_extraction/embedtext.py
new file mode 100644
index 0000000..6b1d537
--- /dev/null
+++ b/src/sagemaker_sklearn_extension/feature_extraction/embedtext.py
@@ -0,0 +1,307 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import gluonnlp as nlp
+import numpy as np
+import scipy.sparse as sp
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import VectorizerMixin, TfidfVectorizer
+from sklearn.utils.validation import check_array, check_is_fitted
+
+
+def load_gluonnlp_embedding(which_embedding='FastText', source='wiki-news-300d-1M', load_ngrams=False):
+    """Loads a GluonNLP embedding into memory.
+
+    FastText:
+        nlp.embedding.FastText(source='wiki-news-300d-1M', load_ngrams=False) - 654 MB
+        nlp.embedding.FastText(source='crawl-300d-2M', load_ngrams=False) - 1665 MB
+        nlp.embedding.FastText(source='wiki-news-300d-1M-subword', load_ngrams=True) - 6902+1063
+        nlp.embedding.FastText(source='crawl-300d-2M-subword', load_ngrams=True) - 6901+1217
+    GloVe:
+        nlp.embedding.GloVe(source='glove.6B.100d') - 144 MB
+        nlp.embedding.GloVe(source='glove.6B.200d') - 284 MB
+        nlp.embedding.GloVe(source='glove.6B.300d') - 424 MB
+        nlp.embedding.GloVe(source='glove.42B.300d') - 2037 MB
+        nlp.embedding.GloVe(source='glove.840B.300d') - 2384 MB
+    Word2Vec:
+        nlp.embedding.Word2Vec(source='GoogleNews-vectors-negative300') - 2856 MB
+        nlp.embedding.Word2Vec(source='freebase-vectors-skipgram1000') - 3404 MB
+        nlp.embedding.Word2Vec(source='freebase-vectors-skipgram1000-en') - 3413 MB
+    """
+    # Compare strings by value, not identity.
+    if which_embedding == 'FastText':
+        return nlp.embedding.FastText(source=source, load_ngrams=load_ngrams,
+                                      embedding_root="../")
+    elif which_embedding == 'glove':
+        return nlp.embedding.GloVe(source=source, embedding_root="../")
+    elif which_embedding == 'Word2Vec':
+        return nlp.embedding.Word2Vec(source=source, embedding_root="../")
+    else:
+        return None
+
+
+class MultiColumnTextEmbeddingTransformer(BaseEstimator, VectorizerMixin, TransformerMixin):
+    """Applies a ``sklearn.feature_extraction.text.TfidfVectorizer``
+    to each column in an array, followed by a fastText embedding lookup.
+
+    Each column of text is treated separately with its own TfidfVectorizer. The vectorizers are applied sequentially.
+
+    Parameters
+    ----------
+    strip_accents : {'ascii', 'unicode', None} (default=None)
+        Remove accents and perform other character normalization during the preprocessing step.
+        'ascii' is a fast method that only works on characters that have a direct ASCII mapping.
+        'unicode' is a slightly slower method that works on any characters.
+        None (default) does nothing.
+
+        Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.
+
+    lowercase : boolean (default=True)
+        Convert all characters to lowercase before tokenizing.
+
+    preprocessor : callable or None (default=None)
+        Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams
+        generation steps.
+
+    tokenizer : callable or None (default=None)
+        Override the string tokenization step while preserving the preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.
+
+    stop_words : string {'english'}, list, or None (default)
+        If 'english', a built-in stop word list for English is used.
+        There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`).
+
+        If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.
+
+        If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically
+        detect and filter stop words based on intra corpus document frequency of terms.
+
+    token_pattern : string
+        Regular expression denoting what constitutes a "token", only used if ``analyzer == 'word'``. The default
+        regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always
+        treated as a token separator).
+
+    ngram_range : tuple (min_n, max_n) (default=(1, 1))
+        The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n
+        such that min_n <= n <= max_n will be used.
+
+    analyzer : string, {'word', 'char', 'char_wb'} or callable
+        Whether the feature should be made of word or character n-grams.
+        Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of
+        words are padded with space.
+
+        If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.
+
+    max_df : float in range [0.0, 1.0] or int (default=1.0)
+        When building the vocabulary ignore terms that have a document frequency strictly higher than the given
+        threshold (corpus-specific stop words).
+        If float, the parameter represents a proportion of documents, integer absolute counts.
+        This parameter is ignored if vocabulary is not None.
+
+    min_df : float in range [0.0, 1.0] or int (default=1)
+        When building the vocabulary ignore terms that have a document frequency strictly lower than the given
+        threshold. This value is also called cut-off in the literature.
+        If float, the parameter represents a proportion of documents, integer absolute counts.
+        This parameter is ignored if vocabulary is not None.
+
+    max_features : int or None (default=1000000)
+        If not None, build a vocabulary that only considers the top max_features ordered by term frequency across
+        the corpus.
+        This parameter is ignored if vocabulary is not None.
+
+    vocabulary : Mapping or iterable, optional (default=None)
+        Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an
+        iterable over terms. If not given, a vocabulary is determined from the input.
+
+    dtype : type, optional (default=float64)
+        Type of the matrix returned by fit_transform() or transform().
+
+    norm : 'l1', 'l2' or None, optional (default='l1')
+        Each output row will have unit norm, either:
+        * 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot
+          product when l2 norm has been applied.
+        * 'l1': Sum of absolute values of vector elements is 1.
+        See :func:`preprocessing.normalize`
+
+    use_idf : boolean (default=True)
+        Enable inverse-document-frequency reweighting.
+
+    smooth_idf : boolean (default=True)
+        Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every
+        term in the collection exactly once. Prevents zero divisions.
+
+    sublinear_tf : boolean (default=False)
+        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
+
+    vocabulary_sizes : list(int) (default=None)
+        Specify the exact vocabulary size to use while encoding each column in the input dataset. The vocabulary size
+        of a column corresponds to the number of features in its TF-IDF encoding, before the feature matrices are
+        concatenated. If the feature matrix of column ``i`` has more features than the corresponding vocabulary size,
+        only the first ``vocabulary_sizes[i]`` features are kept. If the feature matrix of column ``i`` has fewer
+        features than the corresponding vocabulary size, zero columns are added to the feature matrix until it has
+        ``vocabulary_sizes[i]`` features. This parameter is useful if the total number of features of the encoding
+        has to be constant.
+
+    ignore_columns_with_zero_vocabulary_size : boolean (default=True)
+        Allow ValueErrors thrown by ``sklearn.feature_extraction.text.TfidfVectorizer`` because of over-pruning
+        of terms to be ignored, and an empty ``scipy.sparse.csr_matrix`` to be used in place of the given column's
+        TF-IDF document-term matrix.
+
+    Attributes
+    ----------
+    vectorizers_ : list of ``sklearn.feature_extraction.text.TfidfVectorizer``
+        List of ``sklearn.feature_extraction.text.TfidfVectorizer`` instances. Each TfidfVectorizer is separately
+        fitted on an input column. len(self.vectorizers_) equals the number of input columns.
+
+    Notes
+    -----
+    MultiColumnTextEmbeddingTransformer should be used with 2D arrays of text strings; for 1D arrays of text data,
+    use ``sklearn.feature_extraction.text.TfidfVectorizer`` or reshape using array.reshape(-1, 1).
+    """
+
+    def __init__(
+        self,
+        strip_accents=None,
+        lowercase=True,
+        preprocessor=None,
+        tokenizer=nlp.data.SacreMosesTokenizer(),
+        stop_words=None,
+        token_pattern=r"(?u)\b\w\w+\b",
+        ngram_range=(1, 1),
+        analyzer="word",
+        max_df=1.0,
+        min_df=1,
+        max_features=int(10 ** 6),
+        vocabulary=None,
+        dtype=np.float64,
+        norm="l1",
+        use_idf=True,
+        smooth_idf=True,
+        sublinear_tf=False,
+        vocabulary_sizes=None,
+        ignore_columns_with_zero_vocabulary_size=True,
+    ):
+        self.strip_accents = strip_accents
+        self.lowercase = lowercase
+        self.preprocessor = preprocessor
+        self.tokenizer = tokenizer
+        self.stop_words = stop_words
+        self.token_pattern = token_pattern
+        self.ngram_range = ngram_range
+        self.analyzer = analyzer
+        self.max_df = max_df
+        self.min_df = min_df
+        self.max_features = max_features
+        self.vocabulary = vocabulary
+        self.dtype = dtype
+        self.norm = norm
+        self.use_idf = use_idf
+        self.smooth_idf = smooth_idf
+        self.sublinear_tf = sublinear_tf
+        self.vocabulary_sizes = vocabulary_sizes
+        self.ignore_columns_with_zero_vocabulary_size = ignore_columns_with_zero_vocabulary_size
+
+    def fit(self, X, y=None):
+        check_array(X, dtype=None)
+        return self
+
+    def _fit(self, X, y=None):
+        """Build the list of TfidfVectorizers, one per column.
+
+        Parameters
+        ----------
+        X : {array-like}, text data
+
+        Returns
+        -------
+        self : MultiColumnTextEmbeddingTransformer
+        """
+        X = check_array(X, dtype=None)
+        n_columns = X.shape[1]
+
+        # If specified, vocabulary size must be given for each column of the input dataset.
+        if self.vocabulary_sizes and len(self.vocabulary_sizes) != n_columns:
+            raise ValueError("If specified, vocabulary_sizes has to have exactly one entry per data column.")
+
+        self.vectorizers_ = []
+        for col_idx in range(n_columns):
+            max_features = self.max_features
+
+            # Override max_features for the current column in order to enforce the vocabulary size.
+            if self.max_features and self.vocabulary_sizes:
+                max_features = min(self.max_features, self.vocabulary_sizes[col_idx])
+            elif self.vocabulary_sizes:
+                max_features = self.vocabulary_sizes[col_idx]
+
+            try:
+                vectorizer = TfidfVectorizer(
+                    strip_accents=self.strip_accents,
+                    lowercase=self.lowercase,
+                    preprocessor=self.preprocessor,
+                    tokenizer=self.tokenizer,
+                    stop_words=self.stop_words,
+                    token_pattern=self.token_pattern,
+                    ngram_range=self.ngram_range,
+                    analyzer=self.analyzer,
+                    max_df=self.max_df,
+                    min_df=self.min_df,
+                    max_features=max_features,
+                    vocabulary=self.vocabulary,
+                    dtype=self.dtype,
+                    norm=self.norm,
+                    use_idf=self.use_idf,
+                    smooth_idf=self.smooth_idf,
+                    sublinear_tf=self.sublinear_tf,
+                )
+                vectorizer.fit(X[:, col_idx])
+            except ValueError as err:
+                zero_vocab_errors = [
+                    "After pruning, no terms remain. Try a lower min_df or a higher max_df.",
+                    "max_df corresponds to < documents than min_df",
+                    "empty vocabulary; perhaps the documents only contain stop words",
+                ]
+                if str(err) in zero_vocab_errors and self.ignore_columns_with_zero_vocabulary_size:
+                    vectorizer = None
+                else:
+                    raise
+
+            self.vectorizers_.append(vectorizer)
+        return self
+
+    def transform(self, X, y=None):
+        self._fit(X)
+        check_is_fitted(self, "vectorizers_")
+        X = check_array(X, dtype=None)
+        embedding = load_gluonnlp_embedding(which_embedding='FastText', source='wiki-news-300d-1M',
+                                            load_ngrams=False)
+        embedding_dim = 300
+        ret = []
+        for col_idx in range(X.shape[1]):
+            if self.vectorizers_[col_idx]:
+                tfidf_features = self.vectorizers_[col_idx].transform(X[:, col_idx])
+                feature_names = self.vectorizers_[col_idx].get_feature_names()
+                feature_embedding = embedding[feature_names].asnumpy()
+                embedded_column = sp.csr_matrix(tfidf_features.dot(feature_embedding))
+            else:
+                # If ``TfidfVectorizer`` raised a ValueError for this column, substitute an empty
+                # embedding matrix of the expected width.
+                embedded_column = sp.csr_matrix((X.shape[0], embedding_dim))
+            ret.append(embedded_column)
+        return sp.hstack(ret)
+
+    def _more_tags(self):
+        return {"X_types": ["string"]}
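
A minimal usage sketch of how the transformer is expected to be called, assuming the package is installed so that the module path added in this diff is importable and that GluonNLP can download the 'wiki-news-300d-1M' FastText vectors; the two text columns below are made-up illustration data.

    import numpy as np

    from sagemaker_sklearn_extension.feature_extraction.embedtext import MultiColumnTextEmbeddingTransformer

    # Two samples, two text columns; each column gets its own TfidfVectorizer.
    X = np.array(
        [
            ["the cat sat on the mat", "good product, fast shipping"],
            ["dogs chase cats", "slow shipping, would not buy again"],
        ]
    )

    transformer = MultiColumnTextEmbeddingTransformer(min_df=1)
    # transform() fits one TfidfVectorizer per column, looks up a 300-dimensional FastText vector
    # for each vocabulary term, and maps every document to the TF-IDF-weighted combination of those vectors.
    embedded = transformer.fit(X).transform(X)

    # Sparse output with 300 embedding dimensions per input column: (2, 600).
    print(embedded.shape)

Because the default ``norm`` is 'l1', each row of the per-column TF-IDF matrix sums to one in absolute value, so the dot product with the embedding matrix behaves like a weighted average of word vectors rather than an unnormalized sum.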