60 changes: 60 additions & 0 deletions dataset/manythings/_about.txt
@@ -0,0 +1,60 @@
** Info **

Check for newest version here:
http://www.manythings.org/anki/
Date of this file:
2020-08-23

This data is from the sentences_detailed.csv file from tatoeba.org.
http://tatoeba.org/files/downloads/sentences_detailed.csv



** Terms of Use **

See the terms of use.
These files have been released under the same license as the
source.

http://tatoeba.org/eng/terms_of_use
http://creativecommons.org/licenses/by/2.0

Attribution: www.manythings.org/anki and tatoeba.org



** Warnings **

The data from the Tatoeba Project contains errors.

To lower the number of errors you are likely to see, only
sentences by native speakers and proofread sentences have
been included.

For the non-English language, I made these (possibly wrong)
assumptions.
Assumption 1: Sentences written by native speakers can be
trusted.
Assumption 2: Contributors to the Tatoeba Project are honest
about what their native language is.

For English, I used the sentences that I have proofread
and thought were OK.
Of course, I may have missed a few errors.



** Downloading Anki **

See http://ankisrs.net/



** Importing into Anki **

Information is at http://ankisrs.net/docs/manual.html#importing

Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating.
You can choose either:
1. not to allow duplicates (alternate translations) as cards, or
2. to allow duplicates (alternate translations) as cards.
2,252 changes: 2,252 additions & 0 deletions dataset/manythings/pes.txt

Large diffs are not rendered by default.

Empty file added models/__init__.py
Empty file.
30 changes: 30 additions & 0 deletions models/evaluator.py
@@ -0,0 +1,30 @@
from nltk.translate.bleu_score import sentence_bleu


def bleu_evaluation(prediction, truth, verbose=False):
"""
    Evaluate a translated sentence against one or more reference sentences.
    Args:
        prediction (string): translated sentence
        truth (list): list of reference sentences, each wrapped in its own list
        verbose (boolean): whether to print the truth and prediction sentences.
    Returns:
        nothing for now; the BLEU scores are only printed.
Example:
truth = [['i love cats'], ['i love hats']]
candidate = 'i love cat'
bleu_evaluation(candidate, truth)
"""
if verbose:
print('prediction=[%s], truth=%s' % (prediction, truth))
prediction_token = prediction.split()
truth_token = list()
    # tokenize each reference sentence
    for sen in truth:
        truth_token.append(sen[0].split())

print('BLEU, Individual 1-gram: %f' % sentence_bleu(truth_token, prediction_token, weights=(1, 0, 0, 0)))
print('BLEU, Individual 2-gram: %f' % sentence_bleu(truth_token, prediction_token, weights=(0, 1, 0, 0)))
print('BLEU, Individual 3-gram: %f' % sentence_bleu(truth_token, prediction_token, weights=(0, 0, 1, 0)))
print('BLEU, Individual 4-gram: %f' % sentence_bleu(truth_token, prediction_token, weights=(0, 0, 0, 1)))
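
For reference, a minimal usage sketch of bleu_evaluation as defined above. The candidate and
reference sentences are illustrative only, and the import path assumes the repository root is
on the Python path (models/__init__.py is added in this pull request).

from models.evaluator import bleu_evaluation

# two reference translations, each wrapped in its own list, and one candidate sentence
truth = [['i love cats'], ['i love hats']]
candidate = 'i love cat'
bleu_evaluation(candidate, truth, verbose=True)
# prints the prediction/truth pair, then the four individual n-gram BLEU scores from NLTK's sentence_bleu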


66 changes: 66 additions & 0 deletions models/model_utils.py
@@ -0,0 +1,66 @@
import tensorflow as tf
from preprocessing import create_dataset
from sklearn.model_selection import train_test_split


def tokenize(lang):
"""
    Create a tokenizer for the given sentences,
    build the word index and reverse word index,
    and pad each sequence to the length of the longest sentence.
    Args:
        lang (tuple): tuple of sentences to tokenize
    Returns:
        tensor: numpy ndarray of padded word-index sequences
        lang_tokenizer: fitted Keras Tokenizer object
"""
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
lang_tokenizer.fit_on_texts(lang)
tensor = lang_tokenizer.texts_to_sequences(lang)
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
return tensor, lang_tokenizer


def load_dataset(path, num_examples=None):
"""
    Load the dataset and prepare tensors and tokenizers for the input and target languages.
    Args:
        path (string): path to the dataset file
        num_examples (int or None): number of word pairs required
    Returns:
        tensor and tokenizer for the input language and for the target language.
"""
targ_lang, inp_lang = create_dataset(path, num_examples)
input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer


def convert(lang_tokenizer, tensor):
"""
    Helper function that prints the word mapping (word index to word).
    Args:
        lang_tokenizer (Tokenizer): fitted tokenizer
        tensor (numpy array): padded sequence of word indices
    Returns:
        nothing; the mapping is printed.
"""
for t in tensor:
if t != 0:
print("%d ----> %s" % (t, lang_tokenizer.index_word[t]))


def main_process(path_to_file):
"""
    For testing only; to be removed later.
"""
num_examples = 30000
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(path_to_file, num_examples)
# Split arrays or matrices into random train and test subsets
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
target_tensor,
test_size=0.2)
print("Input Language; index to word mapping")
convert(inp_lang_tokenizer, input_tensor_train[0])
print("Target Language; index to word mapping")
convert(targ_lang_tokenizer, target_tensor_train[0])
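
A minimal sketch of how the helpers above fit together, assuming it is run from inside the
models directory (model_utils imports preprocessing directly) and that the dataset added in
this pull request sits at dataset/manythings/pes.txt relative to the repository root.

from model_utils import load_dataset, convert
from sklearn.model_selection import train_test_split

path_to_file = '../dataset/manythings/pes.txt'
# use every available sentence pair
input_tensor, target_tensor, inp_tok, targ_tok = load_dataset(path_to_file, num_examples=None)
print(input_tensor.shape, target_tensor.shape)  # (pairs, padded input length), (pairs, padded target length)

# the same 80/20 split used in main_process above
input_train, input_val, target_train, target_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
convert(inp_tok, input_train[0])  # prints the index-to-word mapping for one input sequence
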
69 changes: 69 additions & 0 deletions models/preprocessing.py
@@ -0,0 +1,69 @@
import unicodedata
import re
import io


def is_english(S):
"""
    Heuristically decide whether the sentence is English by checking that it contains only ASCII characters.
Args:
S (string): input sentence
Returns:
Boolean
"""
try:
S.encode(encoding='utf-8').decode('ascii')
except UnicodeDecodeError:
return False
else:
return True


def unicode_to_ascii(S):
"""
    Strip combining marks (accents) from a unicode sentence after NFD normalization.
    Args:
        S (string): input sentence
    Returns:
        sentence (string): sentence with combining marks removed
"""
sentence = ''.join(c for c in unicodedata.normalize('NFD', S) if unicodedata.category(c) != 'Mn')
return sentence


def preprocess_sentence(sentence):
"""
    Normalize a sentence: lowercase, strip accents, pad punctuation with spaces,
    collapse repeated whitespace, and add start/end tokens.
    Args:
        sentence (string): input sentence
    Returns:
        sentence (string): preprocessed sentence
    """
    sentence = unicode_to_ascii(sentence.lower().strip())
    # put a space before and after each punctuation mark
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    # collapse runs of quotes and spaces into a single space
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()
    if is_english(sentence):
        sentence = '<start> ' + sentence + ' <end>'
    else:
        # non-English (here, Persian) sentences get the markers in reverse order
        sentence = '<end> ' + sentence + ' <start>'
return sentence


def create_dataset(path, num_examples):
"""
    Read the tab-separated dataset file and build cleaned (Persian, English) sentence pairs.
    Args:
        path (string): path to the dataset file
        num_examples (int): number of word pairs required
    Returns:
        the preprocessed Persian sentences and English sentences as two separate sequences
    """
    clean_lines = list()
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    for line in lines:
        # each line is "english<TAB>persian<TAB>attribution"; drop the attribution column
        english, persian = line.rstrip().split('\t', 1)
        persian = persian.rstrip().split('\t', 1)[0]
        clean_lines.append(persian + '\t' + english)
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in clean_lines[:num_examples]]
return zip(*word_pairs)
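
Illustrative only: what preprocess_sentence does to a plain English sentence (the example
sentence is made up, not taken from the dataset), assuming the repository root is on the
Python path.

from models.preprocessing import preprocess_sentence

print(preprocess_sentence('May I borrow this book?'))
# -> <start> may i borrow this book ? <end>
# non-ASCII (Persian) input takes the else branch and is wrapped as '<end> ... <start>' instead
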
39 changes: 39 additions & 0 deletions models/utils.py
@@ -0,0 +1,39 @@
import codecs
import numpy as np
from sklearn.utils import shuffle


def read_text_file(path_to_file):
"""
    Read a text file.
    Args:
        path_to_file (string): path to the text file
    Returns:
        the file contents as a single string
"""
with codecs.open(path_to_file, 'r', encoding="utf-8") as file:
return file.read()


def shuffle_dataset(data):
"""
    Shuffle the data.
    Args:
        data (list): list of data
    Returns:
        the shuffled data as a numpy array
"""
data = np.array(data)
data = shuffle(data)
return data


def get_max_length(items):
"""
    Return the token count of the longest item in the list.
    Args:
        items (list): list of sentences (strings)
    Returns:
        int: number of whitespace-separated tokens in the longest item
"""
return max(len(item.split()) for item in items)
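
Illustrative only: the expected behaviour of the helpers above on a tiny, made-up list,
assuming the repository root is on the Python path.

from models.utils import shuffle_dataset, get_max_length

sentences = ['i love cats', 'may i borrow this book ?']
print(get_max_length(sentences))   # 6 -- token count of the longest sentence
print(shuffle_dataset(sentences))  # the same two sentences as a numpy array, in random order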