diff --git a/README.md b/README.md
old mode 100755
new mode 100644
index 29ed58d..6624217
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+# PyTorch-NLP
+Basic Utilities for PyTorch Natural Language Processing (NLP)
 
 Basic Utilities for PyTorch Natural Language Processing (NLP)
 
@@ -45,7 +47,6 @@ Load the IMDB dataset, for example:
 
 ```python
 from torchnlp.datasets import imdb_dataset
-
 # Load the imdb training dataset
 train = imdb_dataset(train=True)
 train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'}
@@ -55,17 +56,13 @@ Load a custom dataset, for example:
 
 ```python
 from pathlib import Path
-
 from torchnlp.download import download_file_maybe_extract
-
 directory_path = Path('data/')
 train_file_path = Path('trees/train.txt')
-
 download_file_maybe_extract(
     url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
     directory=directory_path,
     check_files=[train_file_path])
-
 open(directory_path / train_file_path)
 ```
 
@@ -80,7 +77,6 @@ text into tokens whenever it encounters a whitespace character.
 
 ```python
 from torchnlp.encoders.text import WhitespaceEncoder
-
 loaded_data = ["now this ain't funny", "so don't you dare laugh"]
 encoder = WhitespaceEncoder(loaded_data)
 encoded_data = [encoder.encode(example) for example in loaded_data]
@@ -95,13 +91,10 @@ import torch
 from torchnlp.samplers import BucketBatchSampler
 from torchnlp.utils import collate_tensors
 from torchnlp.encoders.text import stack_and_pad_tensors
-
 encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]
-
 train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
 train_batch_sampler = BucketBatchSampler(
     train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0])
-
 batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
 batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches]
 ```
@@ -128,9 +121,7 @@ Wrap any code that's random, with `fork_rng` and you'll be good to go, like so:
 import random
 import numpy
 import torch
-
 from torchnlp.random import fork_rng
-
 with fork_rng(seed=123): # Ensure determinism
     print('Random:', random.randint(1, 2**31))
     print('Numpy:', numpy.random.randint(1, 2**31))
@@ -154,9 +145,7 @@ pre-trained word vectors to set your embeddings, like so:
 import torch
 from torchnlp.encoders.text import WhitespaceEncoder
 from torchnlp.word_to_vector import GloVe
-
 encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"])
-
 vocab_set = set(encoder.vocab)
 pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set)
 embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim)
@@ -171,10 +160,8 @@ For example, from the neural network package, apply the state-of-the-art `Locked
 ```python
 import torch
 from torchnlp.nn import LockedDropout
-
 input_ = torch.randn(6, 3, 10)
 dropout = LockedDropout(0.5)
-
 # Apply a LockedDropout to `input_`
 dropout(input_) # RETURNS: torch.FloatTensor (6x3x10)
 ```
@@ -185,10 +172,8 @@ Compute common NLP metrics such as the BLEU score.
 
 ```python
 from torchnlp.metrics import get_moses_multi_bleu
-
 hypotheses = ["The brown fox jumps over the dog 笑"]
 references = ["The quick brown fox jumps over the lazy dog 笑"]
-
 # Compute BLEU score with the official BLEU perl script
 get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9
 ```
diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py
new file mode 100644
index 0000000..2d997be
--- /dev/null
+++ b/torchnlp/metrics/rouge.py
@@ -0,0 +1,113 @@
+"""ROUGE metrics (ROUGE-N and ROUGE-L, F1 variants) for evaluating generated text."""
+
+
+def _get_ngrams(n, text):
+    """Returns the set of n-grams in text, given as a list of tokens."""
+    ngram_set = set()
+    text_length = len(text)
+    max_index_ngram_start = text_length - n
+    for i in range(max_index_ngram_start + 1):
+        ngram_set.add(tuple(text[i:i + n]))
+    return ngram_set
+
+
+def _get_word_ngrams(n, sentences):
+    """Calculates the set of word n-grams in a string of sentences."""
+    assert len(sentences) > 0
+    assert n > 0
+
+    words = split_into_words(sentences)
+    return _get_ngrams(n, words)
+
+
+def rouge_n(evaluated_sentences, reference_sentences, n=2):
+    """Computes the ROUGE-N F1 score of evaluated_sentences against
+    reference_sentences."""
+    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+        raise ValueError("Collections must contain at least 1 sentence.")
+
+    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
+    reference_ngrams = _get_word_ngrams(n, reference_sentences)
+    reference_count = len(reference_ngrams)
+    evaluated_count = len(evaluated_ngrams)
+
+    # Gets the overlapping ngrams between evaluated and reference
+    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
+    overlapping_count = len(overlapping_ngrams)
+
+    # Handle the empty-collection edge case. This isn't mathematically
+    # correct, but it's good enough.
+    if evaluated_count == 0:
+        precision = 0.0
+    else:
+        precision = overlapping_count / evaluated_count
+
+    if reference_count == 0:
+        recall = 0.0
+    else:
+        recall = overlapping_count / reference_count
+
+    # Smoothed F1 so that precision == recall == 0 doesn't divide by zero
+    return 2.0 * ((precision * recall) / (precision + recall + 1e-8))
+
+
+def len_lcs(x, y):
+    """Returns the length of the longest common subsequence of the sequences
+    x and y, computed with bottom-up dynamic programming."""
+    n, m = len(x), len(y)
+    table = dict()
+    for i in range(n + 1):
+        for j in range(m + 1):
+            if i == 0 or j == 0:
+                table[i, j] = 0
+            elif x[i - 1] == y[j - 1]:
+                table[i, j] = table[i - 1, j - 1] + 1
+            else:
+                table[i, j] = max(table[i - 1, j], table[i, j - 1])
+    return table[n, m]
+
+
+def split_into_words(sentences):
+    """Splits a string of one or more sentences into a flat list of words."""
+    return list(sentences.split(" "))
+
+
+def rouge_l(candidate, references):
+    """Computes the ROUGE-L F1 score of two token sequences."""
+    lcs = len_lcs(candidate, references)
+    len_x = len(candidate)
+    len_y = len(references)
+
+    recall = lcs / len_y
+    precision = lcs / len_x
+    beta = precision / (recall + 1e-12)
+    numerator = (1 + (beta ** 2)) * (precision * recall)
+    denominator = (precision * (beta ** 2) + recall) + 1e-8
+    return numerator / denominator
+
+
+def average_rouge(candidate, references):
+    """Averages ROUGE-1, ROUGE-2 and ROUGE-L for a candidate/reference pair."""
+    rouge_1 = rouge_n(candidate, references, 1)
+    rouge_2 = rouge_n(candidate, references, 2)
+    rouge_lcs = rouge_l(split_into_words(candidate),
+                        split_into_words(references))
+    avg_rouge = (rouge_1 + rouge_2 + rouge_lcs) / 3
+    print("rouge_1:", rouge_1)
+    print("rouge_2:", rouge_2)
+    print("rouge_lcs:", rouge_lcs)
+    print("average:", avg_rouge)
+    return avg_rouge
+
+
+def main():
+    x = "The quick brown fox jumped over the wall"
+    y = "The fast black dog and fox jumped into the wall"
+    x_words = split_into_words(x)
+    y_words = split_into_words(y)
+    print("LCS length:", len_lcs(x_words, y_words))
+    average_rouge(x, y)
+
+
+if __name__ == "__main__":
+    main()
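
For reviewers, here is a quick usage sketch of the new module, assuming it is imported via its file path `torchnlp.metrics.rouge` as added in this diff (the candidate/reference strings are illustrative only):

```python
from torchnlp.metrics.rouge import rouge_l, rouge_n, split_into_words

candidate = "the cat sat on the mat"
reference = "the cat was sat on the mat"

# ROUGE-N compares n-gram sets: unigram overlap for n=1, bigram overlap for n=2.
print(rouge_n(candidate, reference, n=1))  # Unigram F1, between 0.0 and 1.0
print(rouge_n(candidate, reference, n=2))  # Bigram F1, between 0.0 and 1.0

# ROUGE-L takes token sequences and scores their longest common subsequence.
print(rouge_l(split_into_words(candidate), split_into_words(reference)))
```

Note that `rouge_n` intersects n-gram sets rather than clipping counts, so a repeated n-gram is only counted once; that matches the code in this diff but differs slightly from count-based ROUGE formulations.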