From dc1f2d90828542bee4ad8c0107d8907c5d841cdf Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Tue, 6 Apr 2021 14:32:50 +0530 Subject: [PATCH 1/8] Rogue metric added. --- torchnlp/metrics/rogue.py | 122 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 torchnlp/metrics/rogue.py diff --git a/torchnlp/metrics/rogue.py b/torchnlp/metrics/rogue.py new file mode 100644 index 0000000..a3b868f --- /dev/null +++ b/torchnlp/metrics/rogue.py @@ -0,0 +1,122 @@ +from __future__ import division +from itertools import chain + + +def get_unigram_count(tokens): #frequency map + count_dict = dict() + for t in tokens: + if t in count_dict: + count_dict[t] += 1 + else: + count_dict[t] = 1 + + return count_dict + + +class Rouge: + beta = 1 + + @staticmethod + def my_lcs_grid(x, y): + n = len(x) + m = len(y) + + table = [[0 for i in range(m + 1)] for j in range(n + 1)] + + for j in range(m + 1): + for i in range(n + 1): + if i == 0 or j == 0: + cell = (0, 'e') + elif x[i - 1] == y[j - 1]: + cell = (table[i - 1][j - 1][0] + 1, '\\') + else: + over = table[i - 1][j][0] + left = table[i][j - 1][0] + + if left < over: + cell = (over, '^') + else: + cell = (left, '<') + + table[i][j] = cell + print(table) + return table + + @staticmethod + def my_lcs(x, y, mask_x): + table = Rouge.my_lcs_grid(x, y) + i = len(x) + j = len(y) + + while i > 0 and j > 0: + move = table[i][j][1] + if move == '\\': + mask_x[i - 1] = 1 + i -= 1 + j -= 1 + elif move == '^': + i -= 1 + elif move == '<': + j -= 1 + print(mask_x) + return mask_x + + @staticmethod + def rouge_l(cand_sents, ref_sents): + lcs_scores = 0.0 + cand_unigrams = get_unigram_count(chain(*cand_sents)) + ref_unigrams = get_unigram_count(chain(*ref_sents)) + for cand_sent in cand_sents: + cand_token_mask = [0 for t in cand_sent] + cand_len = len(cand_sent) + for ref_sent in ref_sents: + # aligns = [] + # Rouge.lcs(ref_sent, cand_sent, aligns) + Rouge.my_lcs(cand_sent, ref_sent, cand_token_mask) + + # for i in aligns: + # ref_token_mask[i] = 1 + # lcs = [] + cur_lcs_score = 0.0 + for i in range(cand_len): + if cand_token_mask[i]: + token = cand_sent[i] + if cand_unigrams[token] > 0 and ref_unigrams[token] > 0: + cand_unigrams[token] -= 1 + ref_unigrams[token] -= 1 + cur_lcs_score += 1 + + # lcs.append(token) + + # print ' '.join(lcs) + + lcs_scores += cur_lcs_score + + # print "lcs_scores: %d" % lcs_scores + ref_words_count = sum(len(s) for s in ref_sents) + # print "ref_words_count: %d" % ref_words_count + cand_words_count = sum(len(s) for s in cand_sents) + # print "cand_words_count: %d" % cand_words_count + + precision = lcs_scores / cand_words_count + recall = lcs_scores / ref_words_count + f_score = (1 + Rouge.beta ** 2) * precision * recall / (recall + + Rouge.beta ** 2 * precision + 1e-7) + 1e-6 # prevent underflow + return precision, recall, f_score + + # @staticmethod + # def rouge_2(cand_sents, ref_sents): + # cand_bigram_counts = get_bigram_counts(cand_sents) + # ref_bigram_counts = get_bigram_counts(ref_sents) + + +if __name__ == '__main__': + r = Rouge() + # A simple eample of how rouge can be calculated + print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], + [[1, 2, 3, 4, 5], [3, 9, 5]])) + + # A more practical example of how it can be used for summary evaluation + system_generated_summary = "The quick fox jumped over the fence" + manual_summmary = "The fast brown fox jumped over the wall" + print(r.rouge_l([system_generated_summary], [manual_summmary])) \ No newline at end of file From 
f82c5c267284ce8b2f94a6171ac31f41d1e3285d Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Wed, 7 Apr 2021 20:41:05 +0530 Subject: [PATCH 2/8] Updated rouge.py --- torchnlp/metrics/rogue.py | 122 -------------------------------------- torchnlp/metrics/rouge.py | 105 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 122 deletions(-) delete mode 100644 torchnlp/metrics/rogue.py create mode 100644 torchnlp/metrics/rouge.py diff --git a/torchnlp/metrics/rogue.py b/torchnlp/metrics/rogue.py deleted file mode 100644 index a3b868f..0000000 --- a/torchnlp/metrics/rogue.py +++ /dev/null @@ -1,122 +0,0 @@ -from __future__ import division -from itertools import chain - - -def get_unigram_count(tokens): #frequency map - count_dict = dict() - for t in tokens: - if t in count_dict: - count_dict[t] += 1 - else: - count_dict[t] = 1 - - return count_dict - - -class Rouge: - beta = 1 - - @staticmethod - def my_lcs_grid(x, y): - n = len(x) - m = len(y) - - table = [[0 for i in range(m + 1)] for j in range(n + 1)] - - for j in range(m + 1): - for i in range(n + 1): - if i == 0 or j == 0: - cell = (0, 'e') - elif x[i - 1] == y[j - 1]: - cell = (table[i - 1][j - 1][0] + 1, '\\') - else: - over = table[i - 1][j][0] - left = table[i][j - 1][0] - - if left < over: - cell = (over, '^') - else: - cell = (left, '<') - - table[i][j] = cell - print(table) - return table - - @staticmethod - def my_lcs(x, y, mask_x): - table = Rouge.my_lcs_grid(x, y) - i = len(x) - j = len(y) - - while i > 0 and j > 0: - move = table[i][j][1] - if move == '\\': - mask_x[i - 1] = 1 - i -= 1 - j -= 1 - elif move == '^': - i -= 1 - elif move == '<': - j -= 1 - print(mask_x) - return mask_x - - @staticmethod - def rouge_l(cand_sents, ref_sents): - lcs_scores = 0.0 - cand_unigrams = get_unigram_count(chain(*cand_sents)) - ref_unigrams = get_unigram_count(chain(*ref_sents)) - for cand_sent in cand_sents: - cand_token_mask = [0 for t in cand_sent] - cand_len = len(cand_sent) - for ref_sent in ref_sents: - # aligns = [] - # Rouge.lcs(ref_sent, cand_sent, aligns) - Rouge.my_lcs(cand_sent, ref_sent, cand_token_mask) - - # for i in aligns: - # ref_token_mask[i] = 1 - # lcs = [] - cur_lcs_score = 0.0 - for i in range(cand_len): - if cand_token_mask[i]: - token = cand_sent[i] - if cand_unigrams[token] > 0 and ref_unigrams[token] > 0: - cand_unigrams[token] -= 1 - ref_unigrams[token] -= 1 - cur_lcs_score += 1 - - # lcs.append(token) - - # print ' '.join(lcs) - - lcs_scores += cur_lcs_score - - # print "lcs_scores: %d" % lcs_scores - ref_words_count = sum(len(s) for s in ref_sents) - # print "ref_words_count: %d" % ref_words_count - cand_words_count = sum(len(s) for s in cand_sents) - # print "cand_words_count: %d" % cand_words_count - - precision = lcs_scores / cand_words_count - recall = lcs_scores / ref_words_count - f_score = (1 + Rouge.beta ** 2) * precision * recall / (recall + - Rouge.beta ** 2 * precision + 1e-7) + 1e-6 # prevent underflow - return precision, recall, f_score - - # @staticmethod - # def rouge_2(cand_sents, ref_sents): - # cand_bigram_counts = get_bigram_counts(cand_sents) - # ref_bigram_counts = get_bigram_counts(ref_sents) - - -if __name__ == '__main__': - r = Rouge() - # A simple eample of how rouge can be calculated - print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], - [[1, 2, 3, 4, 5], [3, 9, 5]])) - - # A more practical example of how it can be used for summary evaluation - system_generated_summary = "The quick fox jumped over the fence" - 
manual_summmary = "The fast brown fox jumped over the wall" - print(r.rouge_l([system_generated_summary], [manual_summmary])) \ No newline at end of file diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py new file mode 100644 index 0000000..9aacc37 --- /dev/null +++ b/torchnlp/metrics/rouge.py @@ -0,0 +1,105 @@ +import itertools +import numpy as np + +def _get_ngrams(n, text): + + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + +def _get_word_ngrams(n, sentences): + """Calculates word n-grams for multiple sentences. + """ + assert len(sentences) > 0 + assert n > 0 + + words = split_into_words(sentences) + return _get_ngrams(n, words) + +def rouge_n(evaluated_sentences, reference_sentences, n=2): + + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count + + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score + +def len_lcs(x, y): + + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table[n,m] + + +def split_into_words(sentences): + """Splits multiple sentences into words and flattens the result""" + return list(sentences.split(" ")) + + +def rogue_l( candidate, references ): + lcs = len_lcs ( candidate, references ) + len_x = len(candidate) + len_y = len(references) + + recall = lcs / len_y + precision = lcs / len_x + beta = precision/ (recall + 1e-12) + numerator = (1 + (beta ** 2 ) )* ( precision * recall ) + denominator = ( precision* ( beta ** 2 ) + recall ) + 1e-8 + f1_score = numerator/ denominator + return f1_score + +def average_rouge ( candidate, references ): + rouge_1 = rouge_n( candidate, references, 1 ) + rouge_2 = rouge_n( candidate, references, 2 ) + rouge_lcs = rogue_l( split_into_words(candidate), split_into_words(references) ) + avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + print("rouge_1:", rouge_1) + print("rouge_2:", rouge_2) + print("rouge_lcs:", rouge_lcs) + print("average:" ,avg_rouge) + +def main(): + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words,y_words) + average_rouge(x, y ) + +if __main__ == "main": + main() From 92dee4c70e5ec6a408be117e3e3ebc40fc72c69d Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Wed, 7 Apr 2021 21:58:51 +0530 Subject: [PATCH 
3/8] Update rouge.py --- torchnlp/metrics/rouge.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index 9aacc37..8ef760e 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -93,13 +93,13 @@ def average_rouge ( candidate, references ): print("average:" ,avg_rouge) def main(): - x = "The quick brown fox jumped over the wall" - y = "The fast black dog and fox jumped into the wall" - x_words = split_into_words(x) - y_words = split_into_words(y) - print(x_words) - lcs = len_lcs(x_words,y_words) - average_rouge(x, y ) - -if __main__ == "main": + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words,y_words) + average_rouge(x, y ) + +if __name__ == "__main__": main() From 6f4d00c68516a1497505962b85977abba8f5ff00 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Thu, 8 Apr 2021 11:11:14 +0530 Subject: [PATCH 4/8] Delete README.md Trying to fix a build issue --- README.md | 247 ------------------------------------------------------ 1 file changed, 247 deletions(-) delete mode 100755 README.md diff --git a/README.md b/README.md deleted file mode 100755 index 29ed58d..0000000 --- a/README.md +++ /dev/null @@ -1,247 +0,0 @@ -

-<p align="center"><img width="55%" src="docs/_static/img/logo_horizontal_color.svg" /></p>
-
-<h3 align="center">Basic Utilities for PyTorch Natural Language Processing (NLP)</h3>
- -PyTorch-NLP, or `torchnlp` for short, is a library of basic utilities for PyTorch -NLP. `torchnlp` extends PyTorch to provide you with -basic text data processing functions. - -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-nlp.svg?style=flat-square) -[![Codecov](https://img.shields.io/codecov/c/github/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://codecov.io/gh/PetrochukM/PyTorch-NLP) -[![Downloads](http://pepy.tech/badge/pytorch-nlp)](http://pepy.tech/project/pytorch-nlp) -[![Documentation Status](https://img.shields.io/readthedocs/pytorchnlp/latest.svg?style=flat-square)](http://pytorchnlp.readthedocs.io/en/latest/?badge=latest&style=flat-square) -[![Build Status](https://img.shields.io/travis/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://travis-ci.org/PetrochukM/PyTorch-NLP) -[![Twitter: PetrochukM](https://img.shields.io/twitter/follow/MPetrochuk.svg?style=social)](https://twitter.com/MPetrochuk) - -_Logo by [Chloe Yeo](http://www.yeochloe.com/), Corporate Sponsorship by [WellSaid Labs](https://wellsaidlabs.com/)_ - -## Installation 🐾 - -Make sure you have Python 3.6+ and PyTorch 1.0+. You can then install `pytorch-nlp` using -pip: - -```python -pip install pytorch-nlp -``` - -Or to install the latest code via: - -```python -pip install git+https://github.com/PetrochukM/PyTorch-NLP.git -``` - -## Docs - -The complete documentation for PyTorch-NLP is available -via [our ReadTheDocs website](https://pytorchnlp.readthedocs.io). - -## Get Started - -Within an NLP data pipeline, you'll want to implement these basic steps: - -### 1. Load your Data 🐿 - -Load the IMDB dataset, for example: - -```python -from torchnlp.datasets import imdb_dataset - -# Load the imdb training dataset -train = imdb_dataset(train=True) -train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'} -``` - -Load a custom dataset, for example: - -```python -from pathlib import Path - -from torchnlp.download import download_file_maybe_extract - -directory_path = Path('data/') -train_file_path = Path('trees/train.txt') - -download_file_maybe_extract( - url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip', - directory=directory_path, - check_files=[train_file_path]) - -open(directory_path / train_file_path) -``` - -Don't worry we'll handle caching for you! - -### 2. Text to Tensor - -Tokenize and encode your text as a tensor. - -For example, a `WhitespaceEncoder` breaks -text into tokens whenever it encounters a whitespace character. - -```python -from torchnlp.encoders.text import WhitespaceEncoder - -loaded_data = ["now this ain't funny", "so don't you dare laugh"] -encoder = WhitespaceEncoder(loaded_data) -encoded_data = [encoder.encode(example) for example in loaded_data] -``` - -### 3. Tensor to Batch - -With your loaded and encoded data in hand, you'll want to batch your dataset. 
- -```python -import torch -from torchnlp.samplers import BucketBatchSampler -from torchnlp.utils import collate_tensors -from torchnlp.encoders.text import stack_and_pad_tensors - -encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)] - -train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data) -train_batch_sampler = BucketBatchSampler( - train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0]) - -batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler] -batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches] -``` - -PyTorch-NLP builds on top of PyTorch's existing `torch.utils.data.sampler`, `torch.stack` -and `default_collate` to support sequential inputs of varying lengths! - -### 4. Training and Inference - -With your batch in hand, you can use PyTorch to develop and train your model using gradient descent. -For example, check out [this example code](examples/snli/train.py) for training on the Stanford -Natural Language Inference (SNLI) Corpus. - -## Last But Not Least - -PyTorch-NLP has a couple more NLP focused utility packages to support you! 🤗 - -### Deterministic Functions - -Now you've setup your pipeline, you may want to ensure that some functions run deterministically. -Wrap any code that's random, with `fork_rng` and you'll be good to go, like so: - -```python -import random -import numpy -import torch - -from torchnlp.random import fork_rng - -with fork_rng(seed=123): # Ensure determinism - print('Random:', random.randint(1, 2**31)) - print('Numpy:', numpy.random.randint(1, 2**31)) - print('Torch:', int(torch.randint(1, 2**31, (1,)))) -``` - -This will always print: - -```text -Random: 224899943 -Numpy: 843828735 -Torch: 843828736 -``` - -### Pre-Trained Word Vectors - -Now that you've computed your vocabulary, you may want to make use of -pre-trained word vectors to set your embeddings, like so: - -```python -import torch -from torchnlp.encoders.text import WhitespaceEncoder -from torchnlp.word_to_vector import GloVe - -encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"]) - -vocab_set = set(encoder.vocab) -pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set) -embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim) -for i, token in enumerate(encoder.vocab): - embedding_weights[i] = pretrained_embedding[token] -``` - -### Neural Networks Layers - -For example, from the neural network package, apply the state-of-the-art `LockedDropout`: - -```python -import torch -from torchnlp.nn import LockedDropout - -input_ = torch.randn(6, 3, 10) -dropout = LockedDropout(0.5) - -# Apply a LockedDropout to `input_` -dropout(input_) # RETURNS: torch.FloatTensor (6x3x10) -``` - -### Metrics - -Compute common NLP metrics such as the BLEU score. - -```python -from torchnlp.metrics import get_moses_multi_bleu - -hypotheses = ["The brown fox jumps over the dog 笑"] -references = ["The quick brown fox jumps over the lazy dog 笑"] - -# Compute BLEU score with the official BLEU perl script -get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9 -``` - -### Help :question: - -Maybe looking at longer examples may help you at [`examples/`](examples/). - -Need more help? We are happy to answer your questions via [Gitter Chat](https://gitter.im/PyTorch-NLP) - -## Contributing - -We've released PyTorch-NLP because we found a lack of basic toolkits for NLP in PyTorch. 
We hope -that other organizations can benefit from the project. We are thankful for any contributions from -the community. - -### Contributing Guide - -Read our [contributing guide](https://github.com/PetrochukM/PyTorch-NLP/blob/master/CONTRIBUTING.md) -to learn about our development process, how to propose bugfixes and improvements, and how to build -and test your changes to PyTorch-NLP. - -## Related Work - -### [torchtext](https://github.com/pytorch/text) - -torchtext and PyTorch-NLP differ in the architecture and feature set; otherwise, they are similar. -torchtext and PyTorch-NLP provide pre-trained word vectors, datasets, iterators and text encoders. -PyTorch-NLP also provides neural network modules and metrics. From an architecture standpoint, -torchtext is object orientated with external coupling while PyTorch-NLP is object orientated with -low coupling. - -### [AllenNLP](https://github.com/allenai/allennlp) - -AllenNLP is designed to be a platform for research. PyTorch-NLP is designed to be a lightweight toolkit. - -## Authors - -- [Michael Petrochuk](https://github.com/PetrochukM/) — Developer -- [Chloe Yeo](http://www.yeochloe.com/) — Logo Design - -## Citing - -If you find PyTorch-NLP useful for an academic publication, then please use the following BibTeX to -cite it: - -``` -@misc{pytorch-nlp, - author = {Petrochuk, Michael}, - title = {PyTorch-NLP: Rapid Prototyping with PyTorch Natural Language Processing (NLP) Tools}, - year = {2018}, - publisher = {GitHub}, - journal = {GitHub repository}, - howpublished = {\url{https://github.com/PetrochukM/PyTorch-NLP}}, -} -``` From 0eacddb778950af81cc0e0df9515c2f62f806029 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 <51365904+PriyaDeshpande1605@users.noreply.github.com> Date: Thu, 8 Apr 2021 11:33:28 +0530 Subject: [PATCH 5/8] Create README.md --- README.md | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..6624217 --- /dev/null +++ b/README.md @@ -0,0 +1,232 @@ +# PyTorch-NLP +Basic Utilities for PyTorch Natural Language Processing (NLP) +

+<p align="center"><img width="55%" src="docs/_static/img/logo_horizontal_color.svg" /></p>
+
+<h3 align="center">Basic Utilities for PyTorch Natural Language Processing (NLP)</h3>
+ +PyTorch-NLP, or `torchnlp` for short, is a library of basic utilities for PyTorch +NLP. `torchnlp` extends PyTorch to provide you with +basic text data processing functions. + +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pytorch-nlp.svg?style=flat-square) +[![Codecov](https://img.shields.io/codecov/c/github/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://codecov.io/gh/PetrochukM/PyTorch-NLP) +[![Downloads](http://pepy.tech/badge/pytorch-nlp)](http://pepy.tech/project/pytorch-nlp) +[![Documentation Status](https://img.shields.io/readthedocs/pytorchnlp/latest.svg?style=flat-square)](http://pytorchnlp.readthedocs.io/en/latest/?badge=latest&style=flat-square) +[![Build Status](https://img.shields.io/travis/PetrochukM/PyTorch-NLP/master.svg?style=flat-square)](https://travis-ci.org/PetrochukM/PyTorch-NLP) +[![Twitter: PetrochukM](https://img.shields.io/twitter/follow/MPetrochuk.svg?style=social)](https://twitter.com/MPetrochuk) + +_Logo by [Chloe Yeo](http://www.yeochloe.com/), Corporate Sponsorship by [WellSaid Labs](https://wellsaidlabs.com/)_ + +## Installation 🐾 + +Make sure you have Python 3.6+ and PyTorch 1.0+. You can then install `pytorch-nlp` using +pip: + +```python +pip install pytorch-nlp +``` + +Or to install the latest code via: + +```python +pip install git+https://github.com/PetrochukM/PyTorch-NLP.git +``` + +## Docs + +The complete documentation for PyTorch-NLP is available +via [our ReadTheDocs website](https://pytorchnlp.readthedocs.io). + +## Get Started + +Within an NLP data pipeline, you'll want to implement these basic steps: + +### 1. Load your Data 🐿 + +Load the IMDB dataset, for example: + +```python +from torchnlp.datasets import imdb_dataset +# Load the imdb training dataset +train = imdb_dataset(train=True) +train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'} +``` + +Load a custom dataset, for example: + +```python +from pathlib import Path +from torchnlp.download import download_file_maybe_extract +directory_path = Path('data/') +train_file_path = Path('trees/train.txt') +download_file_maybe_extract( + url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip', + directory=directory_path, + check_files=[train_file_path]) +open(directory_path / train_file_path) +``` + +Don't worry we'll handle caching for you! + +### 2. Text to Tensor + +Tokenize and encode your text as a tensor. + +For example, a `WhitespaceEncoder` breaks +text into tokens whenever it encounters a whitespace character. + +```python +from torchnlp.encoders.text import WhitespaceEncoder +loaded_data = ["now this ain't funny", "so don't you dare laugh"] +encoder = WhitespaceEncoder(loaded_data) +encoded_data = [encoder.encode(example) for example in loaded_data] +``` + +### 3. Tensor to Batch + +With your loaded and encoded data in hand, you'll want to batch your dataset. 
+ +```python +import torch +from torchnlp.samplers import BucketBatchSampler +from torchnlp.utils import collate_tensors +from torchnlp.encoders.text import stack_and_pad_tensors +encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)] +train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data) +train_batch_sampler = BucketBatchSampler( + train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0]) +batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler] +batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches] +``` + +PyTorch-NLP builds on top of PyTorch's existing `torch.utils.data.sampler`, `torch.stack` +and `default_collate` to support sequential inputs of varying lengths! + +### 4. Training and Inference + +With your batch in hand, you can use PyTorch to develop and train your model using gradient descent. +For example, check out [this example code](examples/snli/train.py) for training on the Stanford +Natural Language Inference (SNLI) Corpus. + +## Last But Not Least + +PyTorch-NLP has a couple more NLP focused utility packages to support you! 🤗 + +### Deterministic Functions + +Now you've setup your pipeline, you may want to ensure that some functions run deterministically. +Wrap any code that's random, with `fork_rng` and you'll be good to go, like so: + +```python +import random +import numpy +import torch +from torchnlp.random import fork_rng +with fork_rng(seed=123): # Ensure determinism + print('Random:', random.randint(1, 2**31)) + print('Numpy:', numpy.random.randint(1, 2**31)) + print('Torch:', int(torch.randint(1, 2**31, (1,)))) +``` + +This will always print: + +```text +Random: 224899943 +Numpy: 843828735 +Torch: 843828736 +``` + +### Pre-Trained Word Vectors + +Now that you've computed your vocabulary, you may want to make use of +pre-trained word vectors to set your embeddings, like so: + +```python +import torch +from torchnlp.encoders.text import WhitespaceEncoder +from torchnlp.word_to_vector import GloVe +encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"]) +vocab_set = set(encoder.vocab) +pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set) +embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim) +for i, token in enumerate(encoder.vocab): + embedding_weights[i] = pretrained_embedding[token] +``` + +### Neural Networks Layers + +For example, from the neural network package, apply the state-of-the-art `LockedDropout`: + +```python +import torch +from torchnlp.nn import LockedDropout +input_ = torch.randn(6, 3, 10) +dropout = LockedDropout(0.5) +# Apply a LockedDropout to `input_` +dropout(input_) # RETURNS: torch.FloatTensor (6x3x10) +``` + +### Metrics + +Compute common NLP metrics such as the BLEU score. + +```python +from torchnlp.metrics import get_moses_multi_bleu +hypotheses = ["The brown fox jumps over the dog 笑"] +references = ["The quick brown fox jumps over the lazy dog 笑"] +# Compute BLEU score with the official BLEU perl script +get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9 +``` + +### Help :question: + +Maybe looking at longer examples may help you at [`examples/`](examples/). + +Need more help? We are happy to answer your questions via [Gitter Chat](https://gitter.im/PyTorch-NLP) + +## Contributing + +We've released PyTorch-NLP because we found a lack of basic toolkits for NLP in PyTorch. 
We hope +that other organizations can benefit from the project. We are thankful for any contributions from +the community. + +### Contributing Guide + +Read our [contributing guide](https://github.com/PetrochukM/PyTorch-NLP/blob/master/CONTRIBUTING.md) +to learn about our development process, how to propose bugfixes and improvements, and how to build +and test your changes to PyTorch-NLP. + +## Related Work + +### [torchtext](https://github.com/pytorch/text) + +torchtext and PyTorch-NLP differ in the architecture and feature set; otherwise, they are similar. +torchtext and PyTorch-NLP provide pre-trained word vectors, datasets, iterators and text encoders. +PyTorch-NLP also provides neural network modules and metrics. From an architecture standpoint, +torchtext is object orientated with external coupling while PyTorch-NLP is object orientated with +low coupling. + +### [AllenNLP](https://github.com/allenai/allennlp) + +AllenNLP is designed to be a platform for research. PyTorch-NLP is designed to be a lightweight toolkit. + +## Authors + +- [Michael Petrochuk](https://github.com/PetrochukM/) — Developer +- [Chloe Yeo](http://www.yeochloe.com/) — Logo Design + +## Citing + +If you find PyTorch-NLP useful for an academic publication, then please use the following BibTeX to +cite it: + +``` +@misc{pytorch-nlp, + author = {Petrochuk, Michael}, + title = {PyTorch-NLP: Rapid Prototyping with PyTorch Natural Language Processing (NLP) Tools}, + year = {2018}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/PetrochukM/PyTorch-NLP}}, +} +``` From 058a0d364a2569321a370d7407dc763b0adb5811 Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 20:51:39 +0530 Subject: [PATCH 6/8] formatting the code --- torchnlp/metrics/rouge.py | 172 ++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 82 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index 8ef760e..cf154f6 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -1,105 +1,113 @@ import itertools import numpy as np + def _get_ngrams(n, text): - - ngram_set = set() - text_length = len(text) - max_index_ngram_start = text_length - n - for i in range(max_index_ngram_start + 1): - ngram_set.add(tuple(text[i:i + n])) - return ngram_set + + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + def _get_word_ngrams(n, sentences): - """Calculates word n-grams for multiple sentences. - """ - assert len(sentences) > 0 - assert n > 0 + """Calculates word n-grams for multiple sentences. 
+ """ + assert len(sentences) > 0 + assert n > 0 + + words = split_into_words(sentences) + return _get_ngrams(n, words) - words = split_into_words(sentences) - return _get_ngrams(n, words) def rouge_n(evaluated_sentences, reference_sentences, n=2): - - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") - evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) - reference_ngrams = _get_word_ngrams(n, reference_sentences) - reference_count = len(reference_ngrams) - evaluated_count = len(evaluated_ngrams) + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") - # Gets the overlapping ngrams between evaluated and reference - overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) - overlapping_count = len(overlapping_ngrams) + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) - # Handle edge case. This isn't mathematically correct, but it's good enough - if evaluated_count == 0: - precision = 0.0 - else: - precision = overlapping_count / evaluated_count + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) - if reference_count == 0: - recall = 0.0 - else: - recall = overlapping_count / reference_count + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count - f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score - # return overlapping_count / reference_count - return f1_score def len_lcs(x, y): - - n, m = len(x), len(y) - table = dict() - for i in range(n + 1): - for j in range(m + 1): - if i == 0 or j == 0: - table[i, j] = 0 - elif x[i - 1] == y[j - 1]: - table[i, j] = table[i - 1, j - 1] + 1 - else: - table[i, j] = max(table[i - 1, j], table[i, j - 1]) - return table[n,m] + + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table[n, m] def split_into_words(sentences): - """Splits multiple sentences into words and flattens the result""" - return list(sentences.split(" ")) - - -def rogue_l( candidate, references ): - lcs = len_lcs ( candidate, references ) - len_x = len(candidate) - len_y = len(references) - - recall = lcs / len_y - precision = lcs / len_x - beta = precision/ (recall + 1e-12) - numerator = (1 + (beta ** 2 ) )* ( precision * recall ) - denominator = ( precision* ( beta ** 2 ) + recall ) + 1e-8 - f1_score = numerator/ denominator - return f1_score - -def average_rouge ( candidate, references ): - rouge_1 = rouge_n( candidate, references, 1 ) - rouge_2 = rouge_n( candidate, references, 2 ) - rouge_lcs = rogue_l( split_into_words(candidate), split_into_words(references) ) - avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 - print("rouge_1:", 
rouge_1) - print("rouge_2:", rouge_2) - print("rouge_lcs:", rouge_lcs) - print("average:" ,avg_rouge) - + """Splits multiple sentences into words and flattens the result""" + return list(sentences.split(" ")) + + +def rogue_l(candidate, references): + lcs = len_lcs(candidate, references) + len_x = len(candidate) + len_y = len(references) + + recall = lcs / len_y + precision = lcs / len_x + beta = precision / (recall + 1e-12) + numerator = (1 + (beta ** 2)) * (precision * recall) + denominator = (precision * (beta ** 2) + recall) + 1e-8 + f1_score = numerator / denominator + return f1_score + + +def average_rouge(candidate, references): + rouge_1 = rouge_n(candidate, references, 1) + rouge_2 = rouge_n(candidate, references, 2) + rouge_lcs = rogue_l(split_into_words(candidate), + split_into_words(references)) + avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + print("rouge_1:", rouge_1) + print("rouge_2:", rouge_2) + print("rouge_lcs:", rouge_lcs) + print("average:", avg_rouge) + + def main(): - x = "The quick brown fox jumped over the wall" - y = "The fast black dog and fox jumped into the wall" - x_words = split_into_words(x) - y_words = split_into_words(y) - print(x_words) - lcs = len_lcs(x_words,y_words) - average_rouge(x, y ) - + x = "The quick brown fox jumped over the wall" + y = "The fast black dog and fox jumped into the wall" + x_words = split_into_words(x) + y_words = split_into_words(y) + print(x_words) + lcs = len_lcs(x_words, y_words) + average_rouge(x, y) + + if __name__ == "__main__": main() From 2a0c3434e2d545e7eb462cb873994f7fd8fd5c2f Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 20:57:30 +0530 Subject: [PATCH 7/8] formatting the code --- torchnlp/metrics/rouge.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index cf154f6..f7475b4 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -1,5 +1,3 @@ -import itertools -import numpy as np def _get_ngrams(n, text): @@ -106,6 +104,7 @@ def main(): y_words = split_into_words(y) print(x_words) lcs = len_lcs(x_words, y_words) + print(lcs) average_rouge(x, y) From 352e844f3d0650d077f18acadac5968d8bc430db Mon Sep 17 00:00:00 2001 From: PriyaDeshpande1605 Date: Thu, 8 Apr 2021 21:03:27 +0530 Subject: [PATCH 8/8] formatting the code --- torchnlp/metrics/rouge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py index f7475b4..2d997be 100644 --- a/torchnlp/metrics/rouge.py +++ b/torchnlp/metrics/rouge.py @@ -90,7 +90,7 @@ def average_rouge(candidate, references): rouge_2 = rouge_n(candidate, references, 2) rouge_lcs = rogue_l(split_into_words(candidate), split_into_words(references)) - avg_rouge = (rouge_1+rouge_2+rouge_lcs)/3 + avg_rouge = (rouge_1 + rouge_2 + rouge_lcs) / 3 print("rouge_1:", rouge_1) print("rouge_2:", rouge_2) print("rouge_lcs:", rouge_lcs)
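
The series leaves `torchnlp/metrics/rouge.py` with a small functional API but no usage documentation. The sketch below is illustrative only: it shows one way to call the module as it stands after PATCH 8/8, assuming the package is importable so that `torchnlp.metrics.rouge` resolves. Note that `rouge_n` takes raw strings and splits them on single spaces internally, `rogue_l` (the misspelling survives in the function name even after the file was renamed from `rogue.py`) expects pre-split token lists, and `average_rouge` prints its results rather than returning them.

```python
# Illustrative sketch, not part of the patch series. Assumes the module is
# importable as torchnlp.metrics.rouge with the names from PATCH 8/8.
from torchnlp.metrics.rouge import average_rouge, rogue_l, rouge_n, split_into_words

candidate = "The quick brown fox jumped over the wall"
reference = "The fast black dog and fox jumped into the wall"

# rouge_n works on raw strings; split_into_words() is applied internally.
print("ROUGE-1 F1:", rouge_n(candidate, reference, n=1))
print("ROUGE-2 F1:", rouge_n(candidate, reference, n=2))

# rogue_l expects token lists, so split explicitly before calling it.
print("ROUGE-L F1:", rogue_l(split_into_words(candidate), split_into_words(reference)))

# average_rouge prints ROUGE-1, ROUGE-2, ROUGE-L, and their mean; it returns None.
average_rouge(candidate, reference)
```

For these two sentences the longest common subsequence is "The, fox, jumped, the, wall" (length 5), so `rogue_l` sees precision 5/8 and recall 5/10 before its beta weighting.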
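
None of the eight patches adds a test, so a hypothetical pytest-style sketch follows for illustration. The import path is an assumption as above, and the expected values are hand-derived from the formulas in the final diff: two overlapping bigrams out of 7 candidate and 9 reference bigrams give ROUGE-2 F1 = 2PR/(P+R) = 0.25, and an LCS of length 5 gives a ROUGE-L score of about 0.5423.

```python
# Hypothetical tests, not part of the series; expected values hand-derived
# from the formulas in the final rouge.py.
import pytest

from torchnlp.metrics.rouge import len_lcs, rogue_l, rouge_n, split_into_words

CANDIDATE = "The quick brown fox jumped over the wall"
REFERENCE = "The fast black dog and fox jumped into the wall"


def test_len_lcs():
    # Longest common subsequence: The, fox, jumped, the, wall.
    assert len_lcs(split_into_words(CANDIDATE), split_into_words(REFERENCE)) == 5


def test_rouge_2():
    # P = 2/7, R = 2/9, so F1 = 2PR / (P + R) = 0.25, up to the 1e-8 smoothing term.
    assert rouge_n(CANDIDATE, REFERENCE, n=2) == pytest.approx(0.25, abs=1e-6)


def test_rogue_l():
    # P = 5/8, R = 5/10; with beta = P/R the F-score works out to ~0.5423.
    score = rogue_l(split_into_words(CANDIDATE), split_into_words(REFERENCE))
    assert score == pytest.approx(0.5423, abs=1e-3)
```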