From dc1f2d90828542bee4ad8c0107d8907c5d841cdf Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Tue, 6 Apr 2021 14:32:50 +0530 Subject: [PATCH 1/8] Rogue metric added. --- torchnlp/metrics/rogue.py | 122 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 torchnlp/metrics/rogue.py diff --git a/torchnlp/metrics/rogue.py b/torchnlp/metrics/rogue.py new file mode 100644 index 0000000..a3b868f --- /dev/null +++ b/torchnlp/metrics/rogue.py @@ -0,0 +1,122 @@ +from __future__ import division +from itertools import chain + + +def get_unigram_count(tokens): #frequency map + count_dict = dict() + for t in tokens: + if t in count_dict: + count_dict[t] += 1 + else: + count_dict[t] = 1 + + return count_dict + + +class Rouge: + beta = 1 + + @staticmethod + def my_lcs_grid(x, y): + n = len(x) + m = len(y) + + table = [[0 for i in range(m + 1)] for j in range(n + 1)] + + for j in range(m + 1): + for i in range(n + 1): + if i == 0 or j == 0: + cell = (0, 'e') + elif x[i - 1] == y[j - 1]: + cell = (table[i - 1][j - 1][0] + 1, '\\') + else: + over = table[i - 1][j][0] + left = table[i][j - 1][0] + + if left < over: + cell = (over, '^') + else: + cell = (left, '<') + + table[i][j] = cell + print(table) + return table + + @staticmethod + def my_lcs(x, y, mask_x): + table = Rouge.my_lcs_grid(x, y) + i = len(x) + j = len(y) + + while i > 0 and j > 0: + move = table[i][j][1] + if move == '\\': + mask_x[i - 1] = 1 + i -= 1 + j -= 1 + elif move == '^': + i -= 1 + elif move == '<': + j -= 1 + print(mask_x) + return mask_x + + @staticmethod + def rouge_l(cand_sents, ref_sents): + lcs_scores = 0.0 + cand_unigrams = get_unigram_count(chain(*cand_sents)) + ref_unigrams = get_unigram_count(chain(*ref_sents)) + for cand_sent in cand_sents: + cand_token_mask = [0 for t in cand_sent] + cand_len = len(cand_sent) + for ref_sent in ref_sents: + # aligns = [] + # Rouge.lcs(ref_sent, cand_sent, aligns) + Rouge.my_lcs(cand_sent, ref_sent, cand_token_mask) + + # for i in aligns: + # ref_token_mask[i] = 1 + # lcs = [] + cur_lcs_score = 0.0 + for i in range(cand_len): + if cand_token_mask[i]: + token = cand_sent[i] + if cand_unigrams[token] > 0 and ref_unigrams[token] > 0: + cand_unigrams[token] -= 1 + ref_unigrams[token] -= 1 + cur_lcs_score += 1 + + # lcs.append(token) + + # print ' '.join(lcs) + + lcs_scores += cur_lcs_score + + # print "lcs_scores: %d" % lcs_scores + ref_words_count = sum(len(s) for s in ref_sents) + # print "ref_words_count: %d" % ref_words_count + cand_words_count = sum(len(s) for s in cand_sents) + # print "cand_words_count: %d" % cand_words_count + + precision = lcs_scores / cand_words_count + recall = lcs_scores / ref_words_count + f_score = (1 + Rouge.beta ** 2) * precision * recall / (recall + + Rouge.beta ** 2 * precision + 1e-7) + 1e-6 # prevent underflow + return precision, recall, f_score + + # @staticmethod + # def rouge_2(cand_sents, ref_sents): + # cand_bigram_counts = get_bigram_counts(cand_sents) + # ref_bigram_counts = get_bigram_counts(ref_sents) + + +if __name__ == '__main__': + r = Rouge() + # A simple eample of how rouge can be calculated + print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], + [[1, 2, 3, 4, 5], [3, 9, 5]])) + + # A more practical example of how it can be used for summary evaluation + system_generated_summary = "The quick fox jumped over the fence" + manual_summmary = "The fast brown fox jumped over the wall" + print(r.rouge_l([system_generated_summary], [manual_summmary])) \ No newline at end of file From 
f82c5c267284ce8b2f94a6171ac31f41d1e3285d Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Wed, 7 Apr 2021 20:41:05 +0530 Subject: [PATCH 2/8] Updated rouge.py --- torchnlp/metrics/rogue.py | 122 -------------------------------------- torchnlp/metrics/rouge.py | 105 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 122 deletions(-) delete mode 100644 torchnlp/metrics/rogue.py create mode 100644 torchnlp/metrics/rouge.py diff --git a/torchnlp/metrics/rogue.py b/torchnlp/metrics/rogue.py deleted file mode 100644 index a3b868f..0000000 --- a/torchnlp/metrics/rogue.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import division -from itertools import chain - - -def get_unigram_count(tokens): #frequency map - count_dict = dict() - for t in tokens: - if t in count_dict: - count_dict[t] += 1 - else: - count_dict[t] = 1 - - return count_dict - - -class Rouge: - beta = 1 - - @staticmethod - def my_lcs_grid(x, y): - n = len(x) - m = len(y) - - table = [[0 for i in range(m + 1)] for j in range(n + 1)] - - for j in range(m + 1): - for i in range(n + 1): - if i == 0 or j == 0: - cell = (0, 'e') - elif x[i - 1] == y[j - 1]: - cell = (table[i - 1][j - 1][0] + 1, '\\') - else: - over = table[i - 1][j][0] - left = table[i][j - 1][0] - - if left < over: - cell = (over, '^') - else: - cell = (left, '<') - - table[i][j] = cell - print(table) - return table - - @staticmethod - def my_lcs(x, y, mask_x): - table = Rouge.my_lcs_grid(x, y) - i = len(x) - j = len(y) - - while i > 0 and j > 0: - move = table[i][j][1] - if move == '\\': - mask_x[i - 1] = 1 - i -= 1 - j -= 1 - elif move == '^': - i -= 1 - elif move == '<': - j -= 1 - print(mask_x) - return mask_x - - @staticmethod - def rouge_l(cand_sents, ref_sents): - lcs_scores = 0.0 - cand_unigrams = get_unigram_count(chain(*cand_sents)) - ref_unigrams = get_unigram_count(chain(*ref_sents)) - for cand_sent in cand_sents: - cand_token_mask = [0 for t in cand_sent] - cand_len = len(cand_sent) - for ref_sent in ref_sents: - # aligns = [] - # Rouge.lcs(ref_sent, cand_sent, aligns) - Rouge.my_lcs(cand_sent, ref_sent, cand_token_mask) - - # for i in aligns: - # ref_token_mask[i] = 1 - # lcs = [] - cur_lcs_score = 0.0 - for i in range(cand_len): - if cand_token_mask[i]: - token = cand_sent[i] - if cand_unigrams[token] > 0 and ref_unigrams[token] > 0: - cand_unigrams[token] -= 1 - ref_unigrams[token] -= 1 - cur_lcs_score += 1 - - # lcs.append(token) - - # print ' '.join(lcs) - - lcs_scores += cur_lcs_score - - # print "lcs_scores: %d" % lcs_scores - ref_words_count = sum(len(s) for s in ref_sents) - # print "ref_words_count: %d" % ref_words_count - cand_words_count = sum(len(s) for s in cand_sents) - # print "cand_words_count: %d" % cand_words_count - - precision = lcs_scores / cand_words_count - recall = lcs_scores / ref_words_count - f_score = (1 + Rouge.beta ** 2) * precision * recall / (recall + - Rouge.beta ** 2 * precision + 1e-7) + 1e-6 # prevent underflow - return precision, recall, f_score - - # @staticmethod - # def rouge_2(cand_sents, ref_sents): - # cand_bigram_counts = get_bigram_counts(cand_sents) - # ref_bigram_counts = get_bigram_counts(ref_sents) - - -if __name__ == '__main__': - r = Rouge() - # A simple eample of how rouge can be calculated - print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], - [[1, 2, 3, 4, 5], [3, 9, 5]])) - - # A more practical example of how it can be used for summary evaluation - system_generated_summary = "The quick fox jumped over the fence" - 
manual_summmary = "The fast brown fox jumped over the wall" - print(r.rouge_l([system_generated_summary], [manual_summmary])) \ No newline at end of file diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py new file mode 100644 index 0000000..9aacc37 --- /dev/null +++ b/torchnlp/metrics/rouge.py @@ -0,0 +1,105 @@ +import itertools +import numpy as np + +def _get_ngrams(n, text): + + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + +def _get_word_ngrams(n, sentences): + """Calculates word n-grams for multiple sentences. + """ + assert len(sentences) > 0 + assert n > 0 + + words = split_into_words(sentences) + return _get_ngrams(n, words) + +def rouge_n(evaluated_sentences, reference_sentences, n=2): + + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count + + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score + +def len_lcs(x, y): + + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table[n,m] + + +def split_into_words(sentences): + """Splits multiple sentences into words and flattens the result""" + return list(sentences.split(" ")) + + +def rogue_l( candidate, references ): + lcs = len_lcs ( candidate, references ) + len_x = len(candidate) + len_y = len(references) + + recall = lcs / len_y + precision = lcs / len_x + beta = precision/ (recall + 1e-12) + numerator = (1 + (beta ** 2 ) )* ( precision * recall ) + denominator = ( precision* ( beta ** 2 ) + recall ) + 1e-8 + f1_score = numerator/ denominator + return f1_score + +def average_rouge ( candidate, references ): + rouge_1 = rouge_n( candidate, references, 1 ) + rouge_2 = rouge_n( candidate, references, 2 ) + rouge_lcs = rogue_l( split_into_words(candidate), split_into_words(references) ) + avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + print("rouge_1:", rouge_1) + print("rouge_2:", rouge_2) + print("rouge_lcs:", rouge_lcs) + print("average:" ,avg_rouge) + +def main(): + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words,y_words) + average_rouge(x, y ) + +if __main__ == "main": + main() From 92dee4c70e5ec6a408be117e3e3ebc40fc72c69d Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Wed, 7 Apr 2021 21:58:51 +0530 Subject: [PATCH 
3/8] Update rouge.py --- torchnlp/metrics/rouge.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index 9aacc37..8ef760e 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -93,13 +93,13 @@ def average_rouge ( candidate, references ): print("average:" ,avg_rouge) def main(): - x = "The quick brown fox jumped over the wall" - y = "The fast black dog and fox jumped into the wall" - x_words = split_into_words(x) - y_words = split_into_words(y) - print(x_words) - lcs = len_lcs(x_words,y_words) - average_rouge(x, y ) - -if __main__ == "main": + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words,y_words) + average_rouge(x, y ) + +if __name__ == "__main__": main() From 6f4d00c68516a1497505962b85977abba8f5ff00 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Thu, 8 Apr 2021 11:11:14 +0530 Subject: [PATCH 4/8] Delete README.md Trying to fix a build issue --- README.md | 247 ------------------------------------------------------ 1 file changed, 247 deletions(-) delete mode 100755 README.md diff --git a/README.md b/README.md deleted file mode 100755 index 29ed58d..0000000 --- a/README.md +++ /dev/null @@ -1,247 +0,0 @@ -

-<p align="center"><img width="55%" src="docs/_static/img/logo_horizontal_color.svg" /></p>
-
-<h3 align="center">Basic Utilities for PyTorch Natural Language Processing (NLP)</h3>
- -PyTorch-NLP, or `torchnlp` for short, is a library of basic utilities for PyTorch -NLP. `torchnlp` extends PyTorch to provide you with -basic text data processing functions. - -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-nlp.svg?style=flat-square) -[![Codecov](https://img.shields.io/codecov/c/github/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://codecov.io/gh/PetrochukM/PyTorch-NLP) -[![Downloads](http://pepy.tech/badge/pytorch-nlp)](http://pepy.tech/project/pytorch-nlp) -[![Documentation Status](https://img.shields.io/readthedocs/pytorchnlp/latest.svg?style=flat-square)](http://pytorchnlp.readthedocs.io/en/latest/?badge=latest&style=flat-square) -[![Build Status](https://img.shields.io/travis/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://travis-ci.org/PetrochukM/PyTorch-NLP) -[![Twitter: PetrochukM](https://img.shields.io/twitter/follow/MPetrochuk.svg?style=social)](https://twitter.com/MPetrochuk) - -_Logo by [Chloe Yeo](http://www.yeochloe.com/), Corporate Sponsorship by [WellSaid Labs](https://wellsaidlabs.com/)_ - -## Installation 🐾 - -Make sure you have Python 3.6+ and PyTorch 1.0+. You can then install `pytorch-nlp` using -pip: - -```python -pip install pytorch-nlp -``` - -Or to install the latest code via: - -```python -pip install git+https://github.com/PetrochukM/PyTorch-NLP.git -``` - -## Docs - -The complete documentation for PyTorch-NLP is available -via [our ReadTheDocs website](https://pytorchnlp.readthedocs.io). - -## Get Started - -Within an NLP data pipeline, you'll want to implement these basic steps: - -### 1. Load your Data 🐿 - -Load the IMDB dataset, for example: - -```python -from torchnlp.datasets import imdb_dataset - -# Load the imdb training dataset -train = imdb_dataset(train=True) -train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'} -``` - -Load a custom dataset, for example: - -```python -from pathlib import Path - -from torchnlp.download import download_file_maybe_extract - -directory_path = Path('data/') -train_file_path = Path('trees/train.txt') - -download_file_maybe_extract( - url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip', - directory=directory_path, - check_files=[train_file_path]) - -open(directory_path / train_file_path) -``` - -Don't worry we'll handle caching for you! - -### 2. Text to Tensor - -Tokenize and encode your text as a tensor. - -For example, a `WhitespaceEncoder` breaks -text into tokens whenever it encounters a whitespace character. - -```python -from torchnlp.encoders.text import WhitespaceEncoder - -loaded_data = ["now this ain't funny", "so don't you dare laugh"] -encoder = WhitespaceEncoder(loaded_data) -encoded_data = [encoder.encode(example) for example in loaded_data] -``` - -### 3. Tensor to Batch - -With your loaded and encoded data in hand, you'll want to batch your dataset. 
- -```python -import torch -from torchnlp.samplers import BucketBatchSampler -from torchnlp.utils import collate_tensors -from torchnlp.encoders.text import stack_and_pad_tensors - -encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)] - -train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data) -train_batch_sampler = BucketBatchSampler( - train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0]) - -batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler] -batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches] -``` - -PyTorch-NLP builds on top of PyTorch's existing `torch.utils.data.sampler`, `torch.stack` -and `default_collate` to support sequential inputs of varying lengths! - -### 4. Training and Inference - -With your batch in hand, you can use PyTorch to develop and train your model using gradient descent. -For example, check out [this example code](examples/snli/train.py) for training on the Stanford -Natural Language Inference (SNLI) Corpus. - -## Last But Not Least - -PyTorch-NLP has a couple more NLP focused utility packages to support you! 🤗 - -### Deterministic Functions - -Now you've setup your pipeline, you may want to ensure that some functions run deterministically. -Wrap any code that's random, with `fork_rng` and you'll be good to go, like so: - -```python -import random -import numpy -import torch - -from torchnlp.random import fork_rng - -with fork_rng(seed=123): # Ensure determinism - print('Random:', random.randint(1, 2**31)) - print('Numpy:', numpy.random.randint(1, 2**31)) - print('Torch:', int(torch.randint(1, 2**31, (1,)))) -``` - -This will always print: - -```text -Random: 224899943 -Numpy: 843828735 -Torch: 843828736 -``` - -### Pre-Trained Word Vectors - -Now that you've computed your vocabulary, you may want to make use of -pre-trained word vectors to set your embeddings, like so: - -```python -import torch -from torchnlp.encoders.text import WhitespaceEncoder -from torchnlp.word_to_vector import GloVe - -encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"]) - -vocab_set = set(encoder.vocab) -pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set) -embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim) -for i, token in enumerate(encoder.vocab): - embedding_weights[i] = pretrained_embedding[token] -``` - -### Neural Networks Layers - -For example, from the neural network package, apply the state-of-the-art `LockedDropout`: - -```python -import torch -from torchnlp.nn import LockedDropout - -input_ = torch.randn(6, 3, 10) -dropout = LockedDropout(0.5) - -# Apply a LockedDropout to `input_` -dropout(input_) # RETURNS: torch.FloatTensor (6x3x10) -``` - -### Metrics - -Compute common NLP metrics such as the BLEU score. - -```python -from torchnlp.metrics import get_moses_multi_bleu - -hypotheses = ["The brown fox jumps over the dog 笑"] -references = ["The quick brown fox jumps over the lazy dog 笑"] - -# Compute BLEU score with the official BLEU perl script -get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9 -``` - -### Help :question: - -Maybe looking at longer examples may help you at [`examples/`](examples/). - -Need more help? We are happy to answer your questions via [Gitter Chat](https://gitter.im/PyTorch-NLP) - -## Contributing - -We've released PyTorch-NLP because we found a lack of basic toolkits for NLP in PyTorch. 
We hope -that other organizations can benefit from the project. We are thankful for any contributions from -the community. - -### Contributing Guide - -Read our [contributing guide](https://github.com/PetrochukM/PyTorch-NLP/blob/master/CONTRIBUTING.md) -to learn about our development process, how to propose bugfixes and improvements, and how to build -and test your changes to PyTorch-NLP. - -## Related Work - -### [torchtext](https://github.com/pytorch/text) - -torchtext and PyTorch-NLP differ in the architecture and feature set; otherwise, they are similar. -torchtext and PyTorch-NLP provide pre-trained word vectors, datasets, iterators and text encoders. -PyTorch-NLP also provides neural network modules and metrics. From an architecture standpoint, -torchtext is object orientated with external coupling while PyTorch-NLP is object orientated with -low coupling. - -### [AllenNLP](https://github.com/allenai/allennlp) - -AllenNLP is designed to be a platform for research. PyTorch-NLP is designed to be a lightweight toolkit. - -## Authors - -- [Michael Petrochuk](https://github.com/PetrochukM/) — Developer -- [Chloe Yeo](http://www.yeochloe.com/) — Logo Design - -## Citing - -If you find PyTorch-NLP useful for an academic publication, then please use the following BibTeX to -cite it: - -``` -@misc{pytorch-nlp, - author = {Petrochuk, Michael}, - title = {PyTorch-NLP: Rapid Prototyping with PyTorch Natural Language Processing (NLP) Tools}, - year = {2018}, - publisher = {GitHub}, - journal = {GitHub repository}, - howpublished = {\url{https://github.com/PetrochukM/PyTorch-NLP}}, -} -``` From 0eacddb778950af81cc0e0df9515c2f62f806029 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Thu, 8 Apr 2021 11:33:28 +0530 Subject: [PATCH 5/8] Create README.md --- README.md | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..6624217 --- /dev/null +++ b/README.md @@ -0,0 +1,232 @@ +# PyTorch-NLP +Basic Utilities for PyTorch Natural Language Processing (NLP) +

+<p align="center"><img width="55%" src="docs/_static/img/logo_horizontal_color.svg" /></p>
+
+<h3 align="center">Basic Utilities for PyTorch Natural Language Processing (NLP)</h3>
+ +PyTorch-NLP, or `torchnlp` for short, is a library of basic utilities for PyTorch +NLP. `torchnlp` extends PyTorch to provide you with +basic text data processing functions. + +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-nlp.svg?style=flat-square) +[![Codecov](https://img.shields.io/codecov/c/github/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://codecov.io/gh/PetrochukM/PyTorch-NLP) +[![Downloads](http://pepy.tech/badge/pytorch-nlp)](http://pepy.tech/project/pytorch-nlp) +[![Documentation Status](https://img.shields.io/readthedocs/pytorchnlp/latest.svg?style=flat-square)](http://pytorchnlp.readthedocs.io/en/latest/?badge=latest&style=flat-square) +[![Build Status](https://img.shields.io/travis/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://travis-ci.org/PetrochukM/PyTorch-NLP) +[![Twitter: PetrochukM](https://img.shields.io/twitter/follow/MPetrochuk.svg?style=social)](https://twitter.com/MPetrochuk) + +_Logo by [Chloe Yeo](http://www.yeochloe.com/), Corporate Sponsorship by [WellSaid Labs](https://wellsaidlabs.com/)_ + +## Installation 🐾 + +Make sure you have Python 3.6+ and PyTorch 1.0+. You can then install `pytorch-nlp` using +pip: + +```python +pip install pytorch-nlp +``` + +Or to install the latest code via: + +```python +pip install git+https://github.com/PetrochukM/PyTorch-NLP.git +``` + +## Docs + +The complete documentation for PyTorch-NLP is available +via [our ReadTheDocs website](https://pytorchnlp.readthedocs.io). + +## Get Started + +Within an NLP data pipeline, you'll want to implement these basic steps: + +### 1. Load your Data 🐿 + +Load the IMDB dataset, for example: + +```python +from torchnlp.datasets import imdb_dataset +# Load the imdb training dataset +train = imdb_dataset(train=True) +train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'} +``` + +Load a custom dataset, for example: + +```python +from pathlib import Path +from torchnlp.download import download_file_maybe_extract +directory_path = Path('data/') +train_file_path = Path('trees/train.txt') +download_file_maybe_extract( + url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip', + directory=directory_path, + check_files=[train_file_path]) +open(directory_path / train_file_path) +``` + +Don't worry we'll handle caching for you! + +### 2. Text to Tensor + +Tokenize and encode your text as a tensor. + +For example, a `WhitespaceEncoder` breaks +text into tokens whenever it encounters a whitespace character. + +```python +from torchnlp.encoders.text import WhitespaceEncoder +loaded_data = ["now this ain't funny", "so don't you dare laugh"] +encoder = WhitespaceEncoder(loaded_data) +encoded_data = [encoder.encode(example) for example in loaded_data] +``` + +### 3. Tensor to Batch + +With your loaded and encoded data in hand, you'll want to batch your dataset. 
+ +```python +import torch +from torchnlp.samplers import BucketBatchSampler +from torchnlp.utils import collate_tensors +from torchnlp.encoders.text import stack_and_pad_tensors +encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)] +train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data) +train_batch_sampler = BucketBatchSampler( + train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0]) +batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler] +batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches] +``` + +PyTorch-NLP builds on top of PyTorch's existing `torch.utils.data.sampler`, `torch.stack` +and `default_collate` to support sequential inputs of varying lengths! + +### 4. Training and Inference + +With your batch in hand, you can use PyTorch to develop and train your model using gradient descent. +For example, check out [this example code](examples/snli/train.py) for training on the Stanford +Natural Language Inference (SNLI) Corpus. + +## Last But Not Least + +PyTorch-NLP has a couple more NLP focused utility packages to support you! 🤗 + +### Deterministic Functions + +Now you've setup your pipeline, you may want to ensure that some functions run deterministically. +Wrap any code that's random, with `fork_rng` and you'll be good to go, like so: + +```python +import random +import numpy +import torch +from torchnlp.random import fork_rng +with fork_rng(seed=123): # Ensure determinism + print('Random:', random.randint(1, 2**31)) + print('Numpy:', numpy.random.randint(1, 2**31)) + print('Torch:', int(torch.randint(1, 2**31, (1,)))) +``` + +This will always print: + +```text +Random: 224899943 +Numpy: 843828735 +Torch: 843828736 +``` + +### Pre-Trained Word Vectors + +Now that you've computed your vocabulary, you may want to make use of +pre-trained word vectors to set your embeddings, like so: + +```python +import torch +from torchnlp.encoders.text import WhitespaceEncoder +from torchnlp.word_to_vector import GloVe +encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"]) +vocab_set = set(encoder.vocab) +pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set) +embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim) +for i, token in enumerate(encoder.vocab): + embedding_weights[i] = pretrained_embedding[token] +``` + +### Neural Networks Layers + +For example, from the neural network package, apply the state-of-the-art `LockedDropout`: + +```python +import torch +from torchnlp.nn import LockedDropout +input_ = torch.randn(6, 3, 10) +dropout = LockedDropout(0.5) +# Apply a LockedDropout to `input_` +dropout(input_) # RETURNS: torch.FloatTensor (6x3x10) +``` + +### Metrics + +Compute common NLP metrics such as the BLEU score. + +```python +from torchnlp.metrics import get_moses_multi_bleu +hypotheses = ["The brown fox jumps over the dog 笑"] +references = ["The quick brown fox jumps over the lazy dog 笑"] +# Compute BLEU score with the official BLEU perl script +get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9 +``` + +### Help :question: + +Maybe looking at longer examples may help you at [`examples/`](examples/). + +Need more help? We are happy to answer your questions via [Gitter Chat](https://gitter.im/PyTorch-NLP) + +## Contributing + +We've released PyTorch-NLP because we found a lack of basic toolkits for NLP in PyTorch. 
We hope +that other organizations can benefit from the project. We are thankful for any contributions from +the community. + +### Contributing Guide + +Read our [contributing guide](https://github.com/PetrochukM/PyTorch-NLP/blob/master/CONTRIBUTING.md) +to learn about our development process, how to propose bugfixes and improvements, and how to build +and test your changes to PyTorch-NLP. + +## Related Work + +### [torchtext](https://github.com/pytorch/text) + +torchtext and PyTorch-NLP differ in the architecture and feature set; otherwise, they are similar. +torchtext and PyTorch-NLP provide pre-trained word vectors, datasets, iterators and text encoders. +PyTorch-NLP also provides neural network modules and metrics. From an architecture standpoint, +torchtext is object orientated with external coupling while PyTorch-NLP is object orientated with +low coupling. + +### [AllenNLP](https://github.com/allenai/allennlp) + +AllenNLP is designed to be a platform for research. PyTorch-NLP is designed to be a lightweight toolkit. + +## Authors + +- [Michael Petrochuk](https://github.com/PetrochukM/) — Developer +- [Chloe Yeo](http://www.yeochloe.com/) — Logo Design + +## Citing + +If you find PyTorch-NLP useful for an academic publication, then please use the following BibTeX to +cite it: + +``` +@misc{pytorch-nlp, + author = {Petrochuk, Michael}, + title = {PyTorch-NLP: Rapid Prototyping with PyTorch Natural Language Processing (NLP) Tools}, + year = {2018}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/PetrochukM/PyTorch-NLP}}, +} +``` From 058a0d364a2569321a370d7407dc763b0adb5811 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 20:51:39 +0530 Subject: [PATCH 6/8] formatting the code --- torchnlp/metrics/rouge.py | 172 ++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 82 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index 8ef760e..cf154f6 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -1,105 +1,113 @@ import itertools import numpy as np + def _get_ngrams(n, text): - - ngram_set = set() - text_length = len(text) - max_index_ngram_start = text_length - n - for i in range(max_index_ngram_start + 1): - ngram_set.add(tuple(text[i:i + n])) - return ngram_set + + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + def _get_word_ngrams(n, sentences): - """Calculates word n-grams for multiple sentences. - """ - assert len(sentences) > 0 - assert n > 0 + """Calculates word n-grams for multiple sentences. 
+ """ + assert len(sentences) > 0 + assert n > 0 + + words = split_into_words(sentences) + return _get_ngrams(n, words) - words = split_into_words(sentences) - return _get_ngrams(n, words) def rouge_n(evaluated_sentences, reference_sentences, n=2): - - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") - evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) - reference_ngrams = _get_word_ngrams(n, reference_sentences) - reference_count = len(reference_ngrams) - evaluated_count = len(evaluated_ngrams) + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") - # Gets the overlapping ngrams between evaluated and reference - overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) - overlapping_count = len(overlapping_ngrams) + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) - # Handle edge case. This isn't mathematically correct, but it's good enough - if evaluated_count == 0: - precision = 0.0 - else: - precision = overlapping_count / evaluated_count + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) - if reference_count == 0: - recall = 0.0 - else: - recall = overlapping_count / reference_count + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count - f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score - # return overlapping_count / reference_count - return f1_score def len_lcs(x, y): - - n, m = len(x), len(y) - table = dict() - for i in range(n + 1): - for j in range(m + 1): - if i == 0 or j == 0: - table[i, j] = 0 - elif x[i - 1] == y[j - 1]: - table[i, j] = table[i - 1, j - 1] + 1 - else: - table[i, j] = max(table[i - 1, j], table[i, j - 1]) - return table[n,m] + + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table[n, m] def split_into_words(sentences): - """Splits multiple sentences into words and flattens the result""" - return list(sentences.split(" ")) - - -def rogue_l( candidate, references ): - lcs = len_lcs ( candidate, references ) - len_x = len(candidate) - len_y = len(references) - - recall = lcs / len_y - precision = lcs / len_x - beta = precision/ (recall + 1e-12) - numerator = (1 + (beta ** 2 ) )* ( precision * recall ) - denominator = ( precision* ( beta ** 2 ) + recall ) + 1e-8 - f1_score = numerator/ denominator - return f1_score - -def average_rouge ( candidate, references ): - rouge_1 = rouge_n( candidate, references, 1 ) - rouge_2 = rouge_n( candidate, references, 2 ) - rouge_lcs = rogue_l( split_into_words(candidate), split_into_words(references) ) - avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 - print("rouge_1:", 
rouge_1) - print("rouge_2:", rouge_2) - print("rouge_lcs:", rouge_lcs) - print("average:" ,avg_rouge) - + """Splits multiple sentences into words and flattens the result""" + return list(sentences.split(" ")) + + +def rogue_l(candidate, references): + lcs = len_lcs(candidate, references) + len_x = len(candidate) + len_y = len(references) + + recall = lcs / len_y + precision = lcs / len_x + beta = precision / (recall + 1e-12) + numerator = (1 + (beta ** 2)) * (precision * recall) + denominator = (precision * (beta ** 2) + recall) + 1e-8 + f1_score = numerator / denominator + return f1_score + + +def average_rouge(candidate, references): + rouge_1 = rouge_n(candidate, references, 1) + rouge_2 = rouge_n(candidate, references, 2) + rouge_lcs = rogue_l(split_into_words(candidate), + split_into_words(references)) + avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + print("rouge_1:", rouge_1) + print("rouge_2:", rouge_2) + print("rouge_lcs:", rouge_lcs) + print("average:", avg_rouge) + + def main(): - x = "The quick brown fox jumped over the wall" - y = "The fast black dog and fox jumped into the wall" - x_words = split_into_words(x) - y_words = split_into_words(y) - print(x_words) - lcs = len_lcs(x_words,y_words) - average_rouge(x, y ) - + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words, y_words) + average_rouge(x, y) + + if __name__ == "__main__": main() From 2a0c3434e2d545e7eb462cb873994f7fd8fd5c2f Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 20:57:30 +0530 Subject: [PATCH 7/8] formatting the code --- torchnlp/metrics/rouge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index cf154f6..f7475b4 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -1,5 +1,3 @@ -import itertools -import numpy as np def _get_ngrams(n, text): @@ -106,6 +104,7 @@ def main(): y_words = split_into_words(y) print(x_words) lcs = len_lcs(x_words, y_words) + print(lcs) average_rouge(x, y) From 352e844f3d0650d077f18acadac5968d8bc430db Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 21:03:27 +0530 Subject: [PATCH 8/8] formatting the code --- torchnlp/metrics/rouge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index f7475b4..2d997be 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -90,7 +90,7 @@ def average_rouge(candidate, references): rouge_2 = rouge_n(candidate, references, 2) rouge_lcs = rogue_l(split_into_words(candidate), split_into_words(references)) - avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + avg_rouge = (rouge_1 + rouge_2 + rouge_lcs) / 3 print("rouge_1:", rouge_1) print("rouge_2:", rouge_2) print("rouge_lcs:", rouge_lcs)
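
The series leaves `torchnlp/metrics/rouge.py` with a small functional API but no usage documentation. The sketch below is illustrative only: it shows one way to call the module as it stands after PATCH 8/8, assuming the package is importable so that `torchnlp.metrics.rouge` resolves. Note that `rouge_n` takes raw strings and splits them on single spaces internally, `rogue_l` (the misspelling survives in the function name even after the file was renamed from `rogue.py`) expects pre-split token lists, and `average_rouge` prints its results rather than returning them.

```python
# Illustrative sketch, not part of the patch series. Assumes the module is
# importable as torchnlp.metrics.rouge with the names from PATCH 8/8.
from torchnlp.metrics.rouge import average_rouge, rogue_l, rouge_n, split_into_words

candidate = "The quick brown fox jumped over the wall"
reference = "The fast black dog and fox jumped into the wall"

# rouge_n works on raw strings; split_into_words() is applied internally.
print("ROUGE-1 F1:", rouge_n(candidate, reference, n=1))
print("ROUGE-2 F1:", rouge_n(candidate, reference, n=2))

# rogue_l expects token lists, so split explicitly before calling it.
print("ROUGE-L F1:", rogue_l(split_into_words(candidate), split_into_words(reference)))

# average_rouge prints ROUGE-1, ROUGE-2, ROUGE-L, and their mean; it returns None.
average_rouge(candidate, reference)
```

For these two sentences the longest common subsequence is "The, fox, jumped, the, wall" (length 5), so `rogue_l` sees precision 5/8 and recall 5/10 before its beta weighting.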
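
None of the eight patches adds a test, so a hypothetical pytest-style sketch follows for illustration. The import path is an assumption as above, and the expected values are hand-derived from the formulas in the final diff: two overlapping bigrams out of 7 candidate and 9 reference bigrams give ROUGE-2 F1 = 2PR/(P+R) = 0.25, and an LCS of length 5 gives a ROUGE-L score of about 0.5423.

```python
# Hypothetical tests, not part of the series; expected values hand-derived
# from the formulas in the final rouge.py.
import pytest

from torchnlp.metrics.rouge import len_lcs, rogue_l, rouge_n, split_into_words

CANDIDATE = "The quick brown fox jumped over the wall"
REFERENCE = "The fast black dog and fox jumped into the wall"


def test_len_lcs():
    # Longest common subsequence: The, fox, jumped, the, wall.
    assert len_lcs(split_into_words(CANDIDATE), split_into_words(REFERENCE)) == 5


def test_rouge_2():
    # P = 2/7, R = 2/9, so F1 = 2PR / (P + R) = 0.25, up to the 1e-8 smoothing term.
    assert rouge_n(CANDIDATE, REFERENCE, n=2) == pytest.approx(0.25, abs=1e-6)


def test_rogue_l():
    # P = 5/8, R = 5/10; with beta = P/R the F-score works out to ~0.5423.
    score = rogue_l(split_into_words(CANDIDATE), split_into_words(REFERENCE))
    assert score == pytest.approx(0.5423, abs=1e-3)
```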