📈 Snorkel Intro Tutorial: Data Augmentation
In this tutorial, we will walk through the process of using transformation functions (TFs) to perform data augmentation.
Like the labeling tutorial, our goal is to train a classifier that labels YouTube comments as SPAM or HAM (not spam).
In the previous tutorial, we demonstrated how to label training sets programmatically with Snorkel.
In this tutorial, we'll assume that step has already been done, and start with labeled training data, which we'll aim to augment using transformation functions.
Data augmentation is a popular technique for increasing the size of labeled training sets by applying class-preserving transformations to create copies of labeled data points. In the image domain, it is a crucial factor in almost every state-of-the-art result today and is quickly gaining popularity in text-based applications. Snorkel models the data augmentation process by applying user-defined transformation functions (TFs) in sequence. You can learn more about data augmentation in this blog post about our NeurIPS 2017 work on automatically learned data augmentation.
The tutorial is divided into four parts:
- Loading Data: We load a YouTube comments dataset.
- Writing Transformation Functions: We write Transformation Functions (TFs) that can be applied to training data points to generate new training data points.
- Applying Transformation Functions to Augment Our Dataset: We apply a sequence of TFs to each training data point, using a random policy, to generate an augmented training set.
- Training a Model: We use the augmented training set to train an LSTM model for classifying new comments as SPAM or HAM.
This next cell takes care of some notebook-specific housekeeping. You can ignore it.
import os
import random
import numpy as np
# Turn off TensorFlow logging messages
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# For reproducibility
seed = 0
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(0)
random.seed(0)
If you want to display all comment text untruncated, set `DISPLAY_ALL_TEXT = True` below.
import pandas as pd
DISPLAY_ALL_TEXT = True
pd.set_option("display.max_colwidth", 0 if DISPLAY_ALL_TEXT else 50)
This next cell makes sure a spaCy English model is downloaded. If this is your first time downloading this model, restart the kernel after executing the next cell.
# Download the spaCy English model
# ! python -m spacy download en_core_web_sm
1. Loading Data
We load the Kaggle dataset and create Pandas DataFrame objects for the train and test sets.
The two main columns in the DataFrames are:
- `text`: Raw text content of the comment
- `label`: Whether the comment is SPAM (1) or HAM (0)
For more details, check out the labeling tutorial.
from utils import load_spam_dataset
df_train, df_test = load_spam_dataset(load_train_labels=True)
# We pull out the label vectors for ease of use later
Y_train = df_train["label"].values
Y_test = df_test["label"].values
df_train.head()
| | author | date | text | label | video |
|---|---|---|---|---|---|
| 0 | Alessandro leite | 2014-11-05T22:21:36 | pls http://www10.vakinha.com.br/VaquinhaE.aspx?e=313327 help me get vip gun cross fire al | 1 | 1 |
| 1 | Salim Tayara | 2014-11-02T14:33:30 | if your like drones, plz subscribe to Kamal Tayara. He takes videos with his drone that are absolutely beautiful. | 1 | 1 |
| 2 | Phuc Ly | 2014-01-20T15:27:47 | go here to check the views :3 | 0 | 1 |
| 3 | DropShotSk8r | 2014-01-19T04:27:18 | Came here to check the views, goodbye. | 0 | 1 |
| 4 | css403 | 2014-11-07T14:25:48 | i am 2,126,492,636 viewer :D | 0 | 1 |
2. Writing Transformation Functions (TFs)
Transformation functions are functions that can be applied to a training data point to create another valid training data point of the same class. For example, for image classification problems, it is common to rotate or crop images in the training data to create new training inputs. Transformation functions should be atomic, e.g. a small rotation of an image, or changing a single word in a sentence. We then compose multiple transformation functions when applying them to training data points.
Common ways to augment text include replacing words with their synonyms, or replacing named entities with other entities. More info can be found here or here.
Our basic modeling assumption is that applying these operations to a comment generally shouldn't change whether it is SPAM or not.
Transformation functions in Snorkel are created with the `transformation_function` decorator, which wraps a function that takes in a single data point and returns a transformed version of the data point. If no transformation is possible, a TF can return `None` or the original data point. If all the TFs applied to a data point return `None`, the data point won't be included in the augmented dataset when we apply our TFs below.
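To make this contract concrete, here is a minimal toy TF of our own (not part of this tutorial's TF set):

from snorkel.augmentation import transformation_function

# A toy TF illustrating the contract: return a transformed data point,
# or None when no transformation applies.
@transformation_function()
def lowercase_text(x):
    if x.text != x.text.lower():
        x.text = x.text.lower()
        return x
    # Falls through to an implicit None for already-lowercase comments,
    # so this data point contributes nothing for this TF.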
Just like the `labeling_function` decorator, the `transformation_function` decorator accepts a `pre` argument for `Preprocessor` objects. Here, we'll use a `SpacyPreprocessor`.
from snorkel.preprocess.nlp import SpacyPreprocessor
spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)
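As a quick illustration (the sentence is made up, and we invoke the preprocessor directly on a `Series` the way the appliers do internally), the preprocessor attaches a parsed spaCy `Doc` in the `doc` field, which the TFs below query for entities and part-of-speech tags:

# Hypothetical data point; `spacy` is the SpacyPreprocessor defined above.
example = pd.Series({"text": "Katy Perry sings great songs."})
processed = spacy(example)
print([(ent.text, ent.label_) for ent in processed.doc.ents])  # named entities
print([(tok.text, tok.pos_) for tok in processed.doc])         # POS tags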
import names
from snorkel.augmentation import transformation_function

# Pregenerate some random person names to replace existing ones with
# for the transformation strategies below
replacement_names = [names.get_full_name() for _ in range(50)]

# Replace a random named entity with a different entity of the same type.
@transformation_function(pre=[spacy])
def change_person(x):
    person_names = [ent.text for ent in x.doc.ents if ent.label_ == "PERSON"]
    # If there is at least one person name, replace a random one. Else return None.
    if person_names:
        name_to_replace = np.random.choice(person_names)
        replacement_name = np.random.choice(replacement_names)
        x.text = x.text.replace(name_to_replace, replacement_name)
        return x
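As a quick sanity check (a sketch; whether the first training comment is actually transformed depends on spaCy finding a PERSON entity in it), a TF can be called directly on a single data point and returns either a transformed copy or `None`:

# Copy the row first so we don't mutate the training DataFrame.
sample = df_train.iloc[0].copy()
transformed = change_person(sample)
print(transformed.text if transformed is not None else "No PERSON entity found")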
# Swap two adjectives at random.
@transformation_function(pre=[spacy])
def swap_adjectives(x):
    adjective_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "ADJ"]  # pos: part-of-speech
    # Check that there are at least two adjectives to swap.
    if len(adjective_idxs) >= 2:
        idx1, idx2 = sorted(np.random.choice(adjective_idxs, 2, replace=False))
        # Swap the tokens in positions idx1 and idx2.
        x.text = " ".join(
            [
                x.doc[:idx1].text,
                x.doc[idx2].text,
                x.doc[1 + idx1 : idx2].text,
                x.doc[idx1].text,
                x.doc[1 + idx2 :].text,
            ]
        )
        return x
We add some transformation functions that use `wordnet` from NLTK to replace different parts of speech with their synonyms.
import nltk
from nltk.corpus import wordnet as wn

nltk.download("wordnet")

def get_synonym(word, pos=None):
    """Get synonym for word given its part-of-speech (pos)."""
    synsets = wn.synsets(word, pos=pos)
    # Return None if wordnet has no synsets (synonym sets) for this word and pos.
    if synsets:
        words = [lemma.name() for lemma in synsets[0].lemmas()]
        if words[0].lower() != word.lower():  # Skip if synonym is the same as the word.
            # Multi-word synonyms in wordnet use '_' as a separator, e.g. reckon_with.
            # Replace it with a space.
            return words[0].replace("_", " ")

def replace_token(spacy_doc, idx, replacement):
    """Replace the token in position idx with replacement."""
    return " ".join([spacy_doc[:idx].text, replacement, spacy_doc[1 + idx :].text])
@transformation_function(pre=[spacy])
def replace_verb_with_synonym(x):
    # Get indices of verb tokens in the sentence.
    verb_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "VERB"]
    if verb_idxs:
        # Pick a random verb idx to replace.
        idx = np.random.choice(verb_idxs)
        synonym = get_synonym(x.doc[idx].text, pos="v")
        # If there's a valid verb synonym, replace it. Otherwise, return None.
        if synonym:
            x.text = replace_token(x.doc, idx, synonym)
            return x

@transformation_function(pre=[spacy])
def replace_noun_with_synonym(x):
    # Get indices of noun tokens in the sentence.
    noun_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "NOUN"]
    if noun_idxs:
        # Pick a random noun idx to replace.
        idx = np.random.choice(noun_idxs)
        synonym = get_synonym(x.doc[idx].text, pos="n")
        # If there's a valid noun synonym, replace it. Otherwise, return None.
        if synonym:
            x.text = replace_token(x.doc, idx, synonym)
            return x

@transformation_function(pre=[spacy])
def replace_adjective_with_synonym(x):
    # Get indices of adjective tokens in the sentence.
    adjective_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "ADJ"]
    if adjective_idxs:
        # Pick a random adjective idx to replace.
        idx = np.random.choice(adjective_idxs)
        synonym = get_synonym(x.doc[idx].text, pos="a")
        # If there's a valid adjective synonym, replace it. Otherwise, return None.
        if synonym:
            x.text = replace_token(x.doc, idx, synonym)
            return x
tfs = [
change_person,
swap_adjectives,
replace_verb_with_synonym,
replace_noun_with_synonym,
replace_adjective_with_synonym,
]
Let's check out a few examples of transformed data points to see what our TFs are doing.
from utils import preview_tfs
preview_tfs(df_train, tfs)
| | TF Name | Original Text | Transformed Text |
|---|---|---|---|
| 0 | change_person | "eye of the tiger" "i am the champion" seems like katy perry is using titles of old rock songs for lyrics.. | "eye of the tiger" "i am the champion" seems like Laura Maldonado is using titles of old rock songs for lyrics.. |
| 1 | swap_adjectives | hey guys look im aware im spamming and it pisses people off but please take a moment to check out my music. im a young rapper and i love to do it and i just wanna share my music with more people just click my picture and then see if you like my stuff | hey guys look im more im spamming and it pisses people off but please take a moment to check out my music. im a young rapper and i love to do it and i just wanna share my music with aware people just click my picture and then see if you like my stuff |
| 2 | replace_verb_with_synonym | "eye of the tiger" "i am the champion" seems like katy perry is using titles of old rock songs for lyrics.. | "eye of the tiger" "i am the champion" seems like katy perry is use titles of old rock songs for lyrics.. |
| 3 | replace_noun_with_synonym | "eye of the tiger" "i am the champion" seems like katy perry is using titles of old rock songs for lyrics.. | "eye of the tiger" "i am the champion" seems like katy perry is using title of old rock songs for lyrics.. |
| 4 | replace_adjective_with_synonym | I started hating Katy Perry after finding out that she stole all of the ideas on her videos from an old comic book. Yet, her music is catchy. | I started hating Katy Perry after finding out that she stole all of the ideas on her videos from an old amusing book. Yet, her music is catchy. |
We notice a couple of things about the TFs.

- Sometimes they make trivial changes (e.g. `"website"` to `"web site"` for `replace_noun_with_synonym`). This can still be helpful for training our model, because it teaches the model to be invariant to such small changes.
- Sometimes they introduce incorrect grammar (e.g. `swap_adjectives` swapping `"young"` and `"more"` above).
The TFs are expected to be heuristic strategies that indeed preserve the class most of the time, but don't need to be perfect. This is especially true when using automated data augmentation techniques which can learn to avoid particularly corrupted data points. As we'll see below, Snorkel is compatible with such learned augmentation policies.
3. Applying Transformation Functions
We'll first define a `Policy` to determine what sequence of TFs to apply to each data point. We'll start with a `RandomPolicy` that samples `sequence_length=2` TFs to apply uniformly at random per data point. The `n_per_original` argument determines how many augmented data points to generate per original data point.
from snorkel.augmentation import RandomPolicy
random_policy = RandomPolicy(
len(tfs), sequence_length=2, n_per_original=2, keep_original=True
)
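Conceptually (a plain-NumPy illustration of what the policy samples, not its actual implementation), each of the `n_per_original=2` augmented copies gets its own random sequence of `sequence_length=2` TF indices, which are then composed:

# Illustration only: sample a TF sequence per augmented copy.
for _ in range(2):
    tf_sequence = np.random.choice(len(tfs), size=2)
    print([tfs[i].name for i in tf_sequence])  # each Snorkel TF exposes .name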
In some cases, we can do better than uniform random sampling. We might have domain knowledge that some TFs should be applied more frequently than others, or have trained an automated data augmentation model that learned a sampling distribution for the TFs. Snorkel supports this use case with a `MeanFieldPolicy`, which allows you to specify a sampling distribution for the TFs. We give higher probabilities to the `replace_[X]_with_synonym` TFs, since those provide more information to the model.
from snorkel.augmentation import MeanFieldPolicy
mean_field_policy = MeanFieldPolicy(
len(tfs),
sequence_length=2,
n_per_original=2,
keep_original=True,
p=[0.05, 0.05, 0.3, 0.3, 0.3],
)
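As a rough, illustrative back-of-the-envelope (actual counts vary, since TFs can return `None`): with `sequence_length=2` and `n_per_original=2`, each original comment yields up to four TF applications, split across the TFs in proportion to `p`:

# Expected number of applications per TF over the training set (approximate).
p = [0.05, 0.05, 0.3, 0.3, 0.3]
expected = {tf.name: round(len(df_train) * 2 * 2 * p_i) for tf, p_i in zip(tfs, p)}
print(expected)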
To apply one or more TFs that we've written to a collection of data points according to our policy, we use a `PandasTFApplier`, because our data points are represented with a Pandas DataFrame.
from snorkel.augmentation import PandasTFApplier
tf_applier = PandasTFApplier(tfs, mean_field_policy)
df_train_augmented = tf_applier.apply(df_train)
Y_train_augmented = df_train_augmented["label"].values
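A couple of quick sanity checks on the result (a sketch; the first holds because `keep_original=True` includes every original row in the output):

assert len(df_train_augmented) >= len(df_train)  # originals are retained
assert set(Y_train_augmented) <= {0, 1}          # labels are still binary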
print(f"Original training set size: {len(df_train)}")
print(f"Augmented training set size: {len(df_train_augmented)}")
Original training set size: 1586
Augmented training set size: 2412
We have grown our dataset by more than 50% using TFs! Note that despite `n_per_original` being set to 2, our dataset does not exactly triple in size, because sometimes TFs return `None` instead of a new data point (e.g. `change_person` when applied to a sentence with no persons). If you prefer to have exact proportions for your dataset, you can have TFs that can't perform a valid transformation return the original data point rather than `None` (which is what our TFs do here).
4. Training a Model
Our final step is to use the augmented data to train a model. We train an LSTM (Long Short-Term Memory) model, which is a very standard architecture for text processing tasks.
The next cell makes Keras results reproducible. You can ignore it.
import tensorflow as tf
session_conf = tf.compat.v1.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1
)
tf.compat.v1.set_random_seed(0)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
Now we'll train our LSTM on both the original and augmented datasets to compare performance.
from utils import featurize_df_tokens, get_keras_lstm
X_train = featurize_df_tokens(df_train)
X_train_augmented = featurize_df_tokens(df_train_augmented)
X_test = featurize_df_tokens(df_test)
def train_and_test(X_train, Y_train, X_test=X_test, Y_test=Y_test, num_buckets=30000):
    # Define a vanilla LSTM model with Keras.
    lstm_model = get_keras_lstm(num_buckets)
    lstm_model.fit(X_train, Y_train, epochs=5, verbose=0)
    preds_test = lstm_model.predict(X_test)[:, 0] > 0.5
    return (preds_test == Y_test).mean()
acc_augmented = train_and_test(X_train_augmented, Y_train_augmented)
acc_original = train_and_test(X_train, Y_train)
print(f"Test Accuracy (original training data): {100 * acc_original:.1f}%")
print(f"Test Accuracy (augmented training data): {100 * acc_augmented:.1f}%")
Test Accuracy (original training data): 47.2%
Test Accuracy (augmented training data): 71.6%
So using the augmented dataset indeed improved our model! There is a lot more you can do with data augmentation, so try a few ideas out on your own!