diff --git a/README.md b/README.md
old mode 100755
new mode 100644
index 29ed58d..6624217
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+# PyTorch-NLP

Basic Utilities for PyTorch Natural Language Processing (NLP)
@@ -45,7 +47,6 @@ Load the IMDB dataset, for example:
```python
from torchnlp.datasets import imdb_dataset
-
# Load the imdb training dataset
train = imdb_dataset(train=True)
train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'}
@@ -55,17 +56,13 @@ Load a custom dataset, for example:
```python
from pathlib import Path
-
from torchnlp.download import download_file_maybe_extract
-
directory_path = Path('data/')
train_file_path = Path('trees/train.txt')
-
download_file_maybe_extract(
url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
directory=directory_path,
check_files=[train_file_path])
-
open(directory_path / train_file_path)
```
@@ -80,7 +77,6 @@ text into tokens whenever it encounters a whitespace character.
```python
from torchnlp.encoders.text import WhitespaceEncoder
-
loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]
@@ -95,13 +91,10 @@ import torch
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors
from torchnlp.encoders.text import stack_and_pad_tensors
-
encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]
-
train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
train_batch_sampler = BucketBatchSampler(
train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0])
-
batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches]
```
@@ -128,9 +121,7 @@ Wrap any code that's random, with `fork_rng` and you'll be good to go, like so:
import random
import numpy
import torch
-
from torchnlp.random import fork_rng
-
with fork_rng(seed=123): # Ensure determinism
print('Random:', random.randint(1, 2**31))
print('Numpy:', numpy.random.randint(1, 2**31))
@@ -154,9 +145,7 @@ pre-trained word vectors to set your embeddings, like so:
import torch
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.word_to_vector import GloVe
-
encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"])
-
vocab_set = set(encoder.vocab)
pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set)
embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim)
@@ -171,10 +160,8 @@ For example, from the neural network package, apply the state-of-the-art `Locked
```python
import torch
from torchnlp.nn import LockedDropout
-
input_ = torch.randn(6, 3, 10)
dropout = LockedDropout(0.5)
-
# Apply a LockedDropout to `input_`
dropout(input_) # RETURNS: torch.FloatTensor (6x3x10)
```
@@ -185,10 +172,8 @@ Compute common NLP metrics such as the BLEU score.
```python
from torchnlp.metrics import get_moses_multi_bleu
-
hypotheses = ["The brown fox jumps over the dog 笑"]
references = ["The quick brown fox jumps over the lazy dog 笑"]
-
# Compute BLEU score with the official BLEU perl script
get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9
```
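+
+Or, compute ROUGE-N and ROUGE-L scores with the new `torchnlp/metrics/rouge.py` module. This is
+a minimal sketch; it assumes the functions are imported straight from the module, since they are
+not yet re-exported by `torchnlp.metrics`:
+
+```python
+from torchnlp.metrics.rouge import rouge_n, rouge_l, split_into_words
+
+candidate = "The quick brown fox jumped over the wall"
+reference = "The fast black dog and fox jumped into the wall"
+
+rouge_n(candidate, reference, n=1)  # Unigram-overlap F1. RETURNS: ~0.56
+rouge_l(split_into_words(candidate), split_into_words(reference))  # LCS F-measure. RETURNS: ~0.54
+```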
diff --git a/torchnlp/metrics/rouge.py b/torchnlp/metrics/rouge.py
new file mode 100644
index 0000000..2d997be
--- /dev/null
+++ b/torchnlp/metrics/rouge.py
@@ -0,0 +1,112 @@
+"""Functional implementations of the ROUGE-N and ROUGE-L text summarization metrics."""
+
+def _get_ngrams(n, text):
+    """Returns the set of n-grams of length `n` in `text`, given as a list of tokens."""
+ ngram_set = set()
+ text_length = len(text)
+ max_index_ngram_start = text_length - n
+ for i in range(max_index_ngram_start + 1):
+ ngram_set.add(tuple(text[i:i + n]))
+ return ngram_set
+
+
+def _get_word_ngrams(n, sentences):
+    """Calculates the set of word n-grams in a text given as a single string."""
+ assert len(sentences) > 0
+ assert n > 0
+
+ words = split_into_words(sentences)
+ return _get_ngrams(n, words)
+
+
+def rouge_n(evaluated_sentences, reference_sentences, n=2):
+    """Computes ROUGE-N, the n-gram overlap F1 score, between an evaluated text
+    and a reference text, each given as a single string.
+
+    For example, with n=1, "the cat sat" vs. "the cat ran" share 2 of 3 unigrams,
+    giving precision = recall = 2/3 and an F1 of roughly 0.67.
+    """
+ if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+ raise ValueError("Collections must contain at least 1 sentence.")
+
+ evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
+ reference_ngrams = _get_word_ngrams(n, reference_sentences)
+ reference_count = len(reference_ngrams)
+ evaluated_count = len(evaluated_ngrams)
+
+ # Gets the overlapping ngrams between evaluated and reference
+ overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
+ overlapping_count = len(overlapping_ngrams)
+
+    # Guard against empty n-gram sets; returning 0.0 avoids division by zero.
+ if evaluated_count == 0:
+ precision = 0.0
+ else:
+ precision = overlapping_count / evaluated_count
+
+ if reference_count == 0:
+ recall = 0.0
+ else:
+ recall = overlapping_count / reference_count
+
+    # F1 is the harmonic mean of precision and recall; the epsilon avoids
+    # division by zero when both are 0.
+    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
+
+    return f1_score
+
+
+def len_lcs(x, y):
+    """Returns the length of the longest common subsequence (LCS) between
+    sequences `x` and `y`, computed with bottom-up dynamic programming in
+    O(len(x) * len(y)) time.
+
+    For example, the LCS of "ABCBDAB" and "BDCABA" is "BCBA", of length 4.
+    """
+ n, m = len(x), len(y)
+ table = dict()
+ for i in range(n + 1):
+ for j in range(m + 1):
+ if i == 0 or j == 0:
+ table[i, j] = 0
+ elif x[i - 1] == y[j - 1]:
+ table[i, j] = table[i - 1, j - 1] + 1
+ else:
+ table[i, j] = max(table[i - 1, j], table[i, j - 1])
+ return table[n, m]
+
+
+def split_into_words(sentences):
+    """Splits a text string into a flat list of whitespace-delimited words."""
+    return sentences.split()
+
+
+def rouge_l(candidate, references):
+    """Computes ROUGE-L, the longest-common-subsequence F-measure, between a
+    candidate and a reference, each given as a list of tokens."""
+    lcs = len_lcs(candidate, references)
+    len_x = len(candidate)
+    len_y = len(references)
+
+    recall = lcs / len_y
+    precision = lcs / len_x
+    # Weighted F-measure with beta = precision / recall, as in common ROUGE-L
+    # implementations; the epsilons guard against division by zero.
+    beta = precision / (recall + 1e-12)
+    numerator = (1 + (beta ** 2)) * (precision * recall)
+    denominator = (precision * (beta ** 2) + recall) + 1e-8
+    f1_score = numerator / denominator
+    return f1_score
+
+
+def average_rouge(candidate, references):
+    """Averages ROUGE-1, ROUGE-2, and ROUGE-L for a candidate/reference pair of
+    strings, printing and returning the result."""
+    rouge_1 = rouge_n(candidate, references, 1)
+    rouge_2 = rouge_n(candidate, references, 2)
+    rouge_lcs = rouge_l(split_into_words(candidate),
+                        split_into_words(references))
+    avg_rouge = (rouge_1 + rouge_2 + rouge_lcs) / 3
+    print("rouge_1:", rouge_1)
+    print("rouge_2:", rouge_2)
+    print("rouge_lcs:", rouge_lcs)
+    print("average:", avg_rouge)
+    return avg_rouge
+
+
+def main():
+ x = "The quick brown fox jumped over the wall"
+ y = "The fast black dog and fox jumped into the wall"
+ x_words = split_into_words(x)
+ y_words = split_into_words(y)
+ print(x_words)
+ lcs = len_lcs(x_words, y_words)
+ print(lcs)
+ average_rouge(x, y)
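+    # Expected output (approximate): lcs == 5; rouge_1 ~ 0.56, rouge_2 ~ 0.25,
+    # rouge_lcs ~ 0.54, average ~ 0.45.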
+
+
+if __name__ == "__main__":
+ main()