from typing import Any

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    DataCollatorWithPadding,
    PreTrainedTokenizerBase,
    BatchEncoding,
)
import pandas as pd
from sklearn.model_selection import train_test_split

/Users/ljvmiranda/Developer/ioaiph26-nlp/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

# Skeleton of the data pipeline; the placeholders (A)-(D) are implemented further down.
label2id = ...  # (A) Map each text label to an integer id
id2label = ...  # (B) The inverse mapping: integer id back to text label

data_loader = DataLoader(
    dataset,  # (C) Dataset class (SkyAssistDataset)
    collate_fn=data_collator,  # (D) DataCollatorWithPadding (SkyAssistDataCollator)
    batch_size=256,  # how many instances the collate_fn receives per batch
    shuffle=True,  # reshuffles the instances, so which ones share a batch varies
)

# Load the training table and the test table from the working directory
full_train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Hold out 10% of the training rows as a dev set (seeded, so the split is
# reproducible), then renumber both parts from 0
train_df, dev_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        full_train_df, test_size=0.1, random_state=42, shuffle=True
    )
)

# (A) Text label -> integer id, built from the training split only
# Intents: one label per row
# Slots: each row is a space-separated tag string, so pool the tags of every row
intents = sorted(train_df["intent"].unique())
label2id_intents = dict(zip(intents, range(len(intents))))
slots = sorted(set().union(*(row.split() for row in train_df["slots"])))
label2id_slots = dict(zip(slots, range(len(slots))))

# (B) Integer id -> text label; ids are positions in the sorted lists
id2label_intents = dict(enumerate(intents))
id2label_slots = dict(enumerate(slots))

# (C) How to represent a single instance?
class SkyAssistDataset(Dataset):
    """Expose the rows of an utterance table as word lists plus integer labels.

    Every row of ``df`` must provide the columns ``text`` (space-separated
    words), ``slots`` (space-separated tags) and ``intent`` (a single label).

    Args:
        df: Table of utterances; a re-indexed copy (0..n-1) is stored.
        label2id_intents: Mapping from intent label to integer id.
        label2id_slots: Mapping from slot tag to integer id.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        label2id_intents: dict[str, int],
        label2id_slots: dict[str, int],
    ) -> None:
        self.df = df.reset_index(drop=True)
        self.label2id_intents = label2id_intents
        self.label2id_slots = label2id_slots

    def __len__(self) -> int:
        """Return the number of rows (instances)."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict[str, Any]:
        """Return row ``idx`` as a dict of words, intent id and slot ids.

        Returns:
            ``input_text``: list of words from ``text.split()``;
            ``intent_label``: intent id (0 when the intent is not in the mapping);
            ``slot_labels``: one id per tag in ``slots.split()``.
        """
        row = self.df.iloc[idx]
        words = row["text"].split()  # ["i", "want", "boston", ...]

        # A tag missing from the mapping (e.g. one never seen in training) is
        # treated as the outside tag "O" rather than as id 0, because id 0 is
        # just whichever real tag happens to sort first. If the mapping has no
        # "O" entry, the previous fallback of 0 is kept.
        unknown_slot_id = self.label2id_slots.get("O", 0)

        # One id per word
        slot_labels = [
            self.label2id_slots.get(tag, unknown_slot_id)
            for tag in row["slots"].split()
        ]

        # One id for the whole text
        intent_label = self.label2id_intents.get(row["intent"], 0)

        return {
            "input_text": words,
            "intent_label": intent_label,
            "slot_labels": slot_labels,
        }

# (D) How to represent a batch (a group of instances)?
class SkyAssistDataCollator(DataCollatorWithPadding):
    """Turn a list of dataset items into one padded, labelled tensor batch.

    Args:
        tokenizer: Tokenizer used to encode the pre-split words.
        slot_pad_id: Label written at token positions that carry no slot
            label of their own (defaults to -100).
    """

    def __init__(
        self, tokenizer: PreTrainedTokenizerBase, slot_pad_id: int = -100
    ) -> None:
        super().__init__(tokenizer)
        self.slot_pad_id = slot_pad_id  # positions the loss should ignore

    def _align_slot_labels(
        self, word_ids: list[int | None], word_labels: list[int]
    ) -> list[int]:
        """Spread per-word labels over tokens; only a word's first token keeps it."""
        token_labels = []
        last_seen = None
        for current in word_ids:
            # A token starts a word when it maps to a word that differs from
            # the previous token's; tokens without a source word never do.
            starts_word = current is not None and current != last_seen
            token_labels.append(
                word_labels[current] if starts_word else self.slot_pad_id
            )
            last_seen = current
        return token_labels

    def __call__(self, features: list[dict[str, Any]]) -> BatchEncoding:
        """Tokenize, pad and label a batch.

        Each feature must carry ``input_text`` (list of words), ``slot_labels``
        (one id per word) and ``intent_label`` (one id). The returned encoding
        holds the tokenizer outputs plus ``slot_labels`` and ``intent_label``
        as long tensors.
        """
        word_lists = [feature["input_text"] for feature in features]

        # The words are already split, so tokenize them as such and pad the
        # whole batch in one go
        encoded = self.tokenizer(
            word_lists,
            is_split_into_words=True,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        # word_ids(row) tells which original word each token of that row came from
        slot_rows = [
            self._align_slot_labels(encoded.word_ids(row), feature["slot_labels"])
            for row, feature in enumerate(features)
        ]
        intent_ids = [feature["intent_label"] for feature in features]

        encoded["slot_labels"] = torch.tensor(slot_rows, dtype=torch.long)
        encoded["intent_label"] = torch.tensor(intent_ids, dtype=torch.long)
        return encoded

def make_submission(
    test_df: pd.DataFrame,
    intent_preds: list[int],  # one intent id per row, in test_df order
    slot_preds: list[list[int]],  # one list of slot ids per row (one id per word)
    id2label_intents: dict[int, str],
    id2label_slots: dict[int, str],
    path: str = "submission.csv",
) -> pd.DataFrame:
    """Decode predicted ids to labels, write them to ``path`` and return the table.

    The table has the columns ``id`` (copied from ``test_df``), ``intent``
    (decoded intent label) and ``slots`` (decoded slot tags joined by single
    spaces). No inference happens here.
    """
    # Ids -> text labels; an id missing from a mapping raises KeyError
    intent_names = list(map(id2label_intents.__getitem__, intent_preds))

    slot_strings = []
    for tag_ids in slot_preds:
        slot_strings.append(" ".join(id2label_slots[tag_id] for tag_id in tag_ids))

    columns = {
        "id": test_df["id"].tolist(),
        "intent": intent_names,
        "slots": slot_strings,
    }
    submission = pd.DataFrame(columns)
    submission.to_csv(path, index=False)
    return submission

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One collator (D) is shared by all splits; each split gets its own Dataset (C)
collator = SkyAssistDataCollator(tokenizer)

train_ds, dev_ds, test_ds = (
    SkyAssistDataset(frame, label2id_intents, label2id_slots)
    for frame in (train_df, dev_df, test_df)
)

# Only the training loader shuffles; dev and test keep row order
train_loader = DataLoader(train_ds, batch_size=256, shuffle=True, collate_fn=collator)
dev_loader = DataLoader(dev_ds, batch_size=32, shuffle=False, collate_fn=collator)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collator)

# A single multi-task model: one shared encoder + two heads.
class SkyAssistModel(nn.Module):
    """Joint intent/slot model: one pretrained encoder feeding two linear heads.

    Args:
        model_name: Name passed to ``AutoModel.from_pretrained``.
        num_intents: Output size of the intent head.
        num_slots: Output size of the slot head.
    """

    def __init__(self, model_name: str, num_intents: int, num_slots: int) -> None:
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)  # shared by both tasks
        width = self.encoder.config.hidden_size
        self.intent_head = nn.Linear(width, num_intents)  # sentence -> intent scores
        self.slot_head = nn.Linear(width, num_slots)  # token -> slot scores

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        slot_labels: torch.Tensor | None = None,
        intent_label: torch.Tensor | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Score intents and slots; also return the joint loss when labels are given.

        Shapes use B = batch size, T = tokens, H = hidden size. Extra keyword
        arguments are accepted and ignored.

        Returns:
            Dict with ``loss`` (``None`` unless both label tensors are passed),
            ``intent_logits`` of shape (B, num_intents) and ``slot_logits`` of
            shape (B, T, num_slots).
        """
        # Contextual vector for every token
        token_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state  # (B, T, H)

        # Position 0 ([CLS]) summarizes the sentence for the intent head
        intent_logits = self.intent_head(token_states[:, 0])  # (B, num_intents)
        slot_logits = self.slot_head(token_states)  # (B, T, num_slots)

        result = {
            "loss": None,
            "intent_logits": intent_logits,
            "slot_logits": slot_logits,
        }

        # Without both label tensors there is nothing to score against
        if intent_label is None or slot_labels is None:
            return result

        intent_loss = nn.functional.cross_entropy(intent_logits, intent_label)

        # Flatten to (B*T, num_slots) vs (B*T,); cross-entropy skips targets
        # equal to -100 by default
        slot_loss = nn.functional.cross_entropy(
            slot_logits.flatten(0, 1), slot_labels.flatten()
        )

        # Both tasks are learned together by summing their losses
        result["loss"] = intent_loss + slot_loss
        return result


# Head sizes come straight from the two label vocabularies
model = SkyAssistModel(model_name, len(label2id_intents), len(label2id_slots))

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 14618.39it/s]
[transformers] BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED:	can be ignored when loading from different task/architecture; not ok if you expect identical arch.

# Fine-tuning: update the encoder and both heads on the training data.
# Pick the device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        # The model and its inputs must live on the same device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch carries the labels, so the model returns the joint loss
        outputs = model(**batch)
        loss = outputs["loss"]

        # Gradients -> parameter update -> clear gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Report the mean per-batch loss of this epoch
    print(
        f"epoch {epoch + 1}/{num_epochs}  train loss: {total_loss / len(train_loader):.4f}"
    )

epoch 1/3  train loss: 4.9622
epoch 2/3  train loss: 2.4199
epoch 3/3  train loss: 1.7027

# Inference: predict on the test set, then write the submission file.
model.eval()
intent_preds: list[int] = []
slot_preds: list[list[int]] = []

with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Highest-scoring class per sentence and per token
        batch_intents = outputs["intent_logits"].argmax(dim=-1)  # (B,)
        batch_slots = outputs["slot_logits"].argmax(dim=-1)  # (B, T)
        true = batch["slot_labels"]  # (B, T), -100 = ignore

        intent_preds.extend(batch_intents.tolist())

        # The collator labels only the first subword of each word, so the
        # non -100 positions give exactly one prediction per word
        for row_slots, row_true in zip(batch_slots, true):
            slot_preds.append(row_slots[row_true != -100].tolist())

submission = make_submission(
    test_df, intent_preds, slot_preds, id2label_intents, id2label_slots
)
submission.head()

from transformers import get_linear_schedule_with_warmup

# Longer run with a warmup schedule; one scheduler step is planned per batch
num_epochs = 10
num_steps = len(train_loader) * num_epochs

optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    int(0.1 * num_steps),  # warmup length: the first 10% of all steps
    num_steps,  # total number of training steps
)

# ...then call scheduler.step() right after optimizer.step() in the loop

# pip install pytorch-crf
from torchcrf import CRF


class SkyAssistModelCRF(nn.Module):
    """Joint intent/slot model whose slot loss is a CRF over word-level tags.

    The encoder and the two linear heads are laid out as in a plain joint
    model; the slot head's outputs serve as CRF emissions. The CRF is used
    for the training loss only: ``forward`` returns the raw emissions and
    performs no CRF decoding.

    Args:
        model_name: Name passed to ``AutoModel.from_pretrained``.
        num_intents: Output size of the intent head.
        num_slots: Number of slot tags (slot head size and CRF tag count).
    """

    def __init__(self, model_name, num_intents, num_slots):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.intent_head = nn.Linear(hidden, num_intents)
        self.slot_head = nn.Linear(hidden, num_slots)
        self.crf = CRF(
            num_slots, batch_first=True
        )  # learns which labels are allowed to follow which

    @staticmethod
    def _left_align_labeled(emissions, slot_labels):
        """Pack the labeled positions of every row to the front of the sequence.

        Positions whose label is -100 (special tokens, continuation subwords,
        padding) must not take part in the CRF. The CRF mask has to be a
        prefix of each row, so those positions cannot simply be masked where
        they are; instead the labeled positions are gathered, in order, into
        the first columns.

        Args:
            emissions: (B, T, num_slots) slot scores.
            slot_labels: (B, T) tag ids, -100 at positions to ignore.

        Returns:
            ``(emissions, tags, mask)`` of shapes (B, W, num_slots), (B, W)
            and (B, W), where W is the largest labeled count in the batch
            (at least 1). ``mask`` is True on real entries; masked entries of
            ``tags`` hold the dummy id 0.
        """
        keep = slot_labels != -100  # (B, T)
        seq_len = slot_labels.size(1)
        positions = torch.arange(seq_len, device=slot_labels.device)

        # Unique sort keys: labeled positions keep their index, ignored ones
        # are pushed past the end, so sorting packs labeled ones first in order
        keys = positions.unsqueeze(0) + seq_len * (~keep).long()
        lengths = keep.sum(dim=1)  # labeled positions per row
        width = max(int(lengths.max().item()), 1)
        order = keys.argsort(dim=1)[:, :width]  # (B, W)

        packed_emissions = emissions.gather(
            1, order.unsqueeze(-1).expand(-1, -1, emissions.size(-1))
        )
        packed_tags = slot_labels.gather(1, order)

        mask = positions[:width].unsqueeze(0) < lengths.unsqueeze(1)
        # The CRF requires the first step of every row to be active, even for
        # a row that has no labeled position at all
        mask[:, 0] = True

        # Whatever -100 remains sits in masked slots (or in the forced first
        # step of an unlabeled row); the CRF needs a valid id there
        packed_tags = packed_tags.masked_fill(packed_tags == -100, 0)
        return packed_emissions, packed_tags, mask

    def forward(
        self, input_ids, attention_mask, slot_labels=None, intent_label=None, **kwargs
    ):
        """Score intents and slots; return the joint loss when labels are given.

        Returns:
            Dict with ``loss`` (``None`` unless both label tensors are passed),
            ``intent_logits`` (B, num_intents) and ``slot_logits``
            (B, T, num_slots). Extra keyword arguments are ignored.
        """
        seq = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        intent_logits = self.intent_head(seq[:, 0])
        slot_logits = self.slot_head(seq)  # the CRF's "emissions"

        loss = None
        if slot_labels is not None and intent_label is not None:
            # Score only the labeled (first-subword) positions, so ignored
            # positions never influence emissions or transitions
            emissions, tags, mask = self._left_align_labeled(slot_logits, slot_labels)

            # Negative log-likelihood of the whole tag sequence, averaged over
            # the batch so it is on the same scale as the mean intent loss
            slot_loss = -self.crf(emissions, tags, mask=mask, reduction="mean")
            intent_loss = nn.functional.cross_entropy(intent_logits, intent_label)
            loss = intent_loss + slot_loss

        return {
            "loss": loss,
            "intent_logits": intent_logits,
            "slot_logits": slot_logits,
        }

import random
from collections import defaultdict


def extract_spans(words, tags):
    """Return every slot span in ``tags`` as (slot_type, start, end), end exclusive.

    A span opens at a ``B-<type>`` tag and runs over the directly following
    ``I-<type>`` tags of the same type. ``I-`` tags with no matching open span
    are skipped. ``words`` is accepted for symmetry with the callers but is
    not read.
    """
    spans = []
    open_type, open_start = None, 0
    for pos, tag in enumerate(tags):
        # Still inside the currently open span
        if open_type is not None and tag == f"I-{open_type}":
            continue
        # Anything else ends the open span just before this position
        if open_type is not None:
            spans.append((open_type, open_start, pos))
            open_type = None
        if tag.startswith("B-"):
            open_type, open_start = tag[2:], pos
    # A span that runs to the last tag is closed at the end of the sequence
    if open_type is not None:
        spans.append((open_type, open_start, len(tags)))
    return spans


def build_value_pool(df):
    """Group the word lists of all slot spans in ``df`` by slot type.

    Reads the ``text`` and ``slots`` columns row by row and returns a
    ``defaultdict(list)`` mapping each slot type to the spans found for it,
    each span being a list of words. Repeated values are kept.
    """
    pool = defaultdict(list)
    for text, slot_string in zip(df["text"], df["slots"]):
        words = text.split()
        tags = slot_string.split()
        for slot_type, start, end in extract_spans(words, tags):
            pool[slot_type].append(words[start:end])
    return pool


# Build the pool from the training split only, then show the first five
# values collected for the toloc.city_name slot
value_pool = build_value_pool(train_df)
print(
    "cities seen as toloc.city_name:",
    list(map(" ".join, value_pool["toloc.city_name"][:5])),
)

cities seen as toloc.city_name: ['san francisco', 'baltimore', 'philadelphia', 'atlanta', 'salt lake city']

def augment_row(text, slots, value_pool, p=0.5, rng=random.Random(42)):
    """Rewrite one utterance by swapping slot values for others of the same type.

    Each slot span is replaced, with probability ``p``, by a value drawn from
    ``value_pool`` for its slot type; the tags are regenerated to match the
    new span length. Words outside any span are copied unchanged.

    Note that the default ``rng`` is created once and shared by every call
    that does not pass its own, so successive calls continue one seeded
    random stream.

    Returns:
        ``(new_text, new_slots)`` as space-joined strings.
    """
    words, tags = text.split(), slots.split()

    # Index the spans by their start position
    span_at = {}
    for slot_type, start, end in extract_spans(words, tags):
        span_at[start] = (slot_type, end)

    out_words, out_tags = [], []
    cursor = 0
    while cursor < len(words):
        span = span_at.get(cursor)
        if span is None:
            # Not the start of a span: copy word and tag as they are
            out_words.append(words[cursor])
            out_tags.append(tags[cursor])
            cursor += 1
            continue

        slot_type, end = span
        candidates = value_pool.get(slot_type, [])
        replacement = words[cursor:end]
        # Draw the coin only when there is something to swap in
        if candidates and rng.random() < p:
            replacement = rng.choice(candidates)

        out_words.extend(replacement)
        out_tags.append(f"B-{slot_type}")
        out_tags.extend([f"I-{slot_type}"] * (len(replacement) - 1))
        cursor = end

    return " ".join(out_words), " ".join(out_tags)


# One augmented copy per training row (same id and intent, swapped text and
# slots), appended below the original rows
augmented = [
    {
        "id": row["id"],
        "intent": row["intent"],
        **dict(
            zip(
                ("text", "slots"),
                augment_row(row["text"], row["slots"], value_pool),
            )
        ),
    }
    for _, row in train_df.iterrows()
]

train_df_aug = pd.concat([train_df, pd.DataFrame(augmented)], ignore_index=True)

# Compare the first row before and after augmentation
print("original: ", train_df.iloc[0]["text"])
print("augmented:", augmented[0]["text"])

original:  please show me any united flights including connections between boston and san francisco at 5 in the evening
augmented: please show me any united flights including connect between baltimore and san francisco at 5 in the evening

column	description
`id`	unique row identifier
`intent`	the single intent label for the whole utterance (e.g. `flight`, `airfare`, `ground_service`)
`text`	the utterance, lower-cased and split into space-separated words
`slots`	a space-separated BIO tag for every word in `text`, aligned one-to-one

Sign that something is wrong	Likely cause	How to probe and check
Loss is flat, `NaN`, or bouncing around	loss not wired up, learning rate too high, or `zero_grad` missing	Overfit 1 to 2 batches for many steps. The loss should fall close to 0. If it cannot, the bug is structural, not a tuning issue.
Dev accuracy looks high but the predictions feel useless	class imbalance, so the model just predicts the majority (`O` for slots, `flight` for intent)	Report F1 (and look at it per class), compare against an always-predict-majority baseline, and print the distribution of predictions.
A row's predicted slot count does not equal its number of words	alignment or truncation bug	For every row, assert `len(pred_slots) == len(text.split())`. Inspect one long sentence by hand.
The submission seems scrambled against the ids	`test_loader` was shuffled, or the row order changed somewhere	Confirm `test_loader` uses `shuffle=False`, then spot-check a few `id`s against their `text`.
Illegal tag sequences (an `I-x` with no `B-x` before it)	each token is labeled independently, so nothing enforces valid order	Scan predictions for any `I-` tag not preceded by a matching `B-`.
Labels look shifted or just wrong, even though the code runs fine	`id2label` is not the exact inverse of `label2id`, or a special-token offset	Decode one example and print the word, the gold label, and the predicted label side by side.
`CrossEntropyLoss` throws an index error	a label id is greater than or equal to the number of classes (head size mismatch)	Check that `num_slots == len(label2id_slots)` and that every label id is either in range or `-100`.
Dev score is far below the train score	overfitting to the training set	Track train loss vs dev loss (or F1) across epochs and watch the gap.

Problem 1: SkyAssist¶

Dataset¶

Restrictions¶

Step 1: (R)ead the problem¶

Step 2: (I)mplement the Baseline¶

Step 3: (C)heck for Errors¶

Step 4: (E)nhance the Solution¶

Trivial enhancements¶

Stronger Encoder¶

Hyperparameter Tuning¶

Architectural Improvements¶

Data-centric improvements¶

	id	intent	slots
0	0	flight	O O O O O O O O B-fromloc.city_name O B-toloc....
1	1	airfare	O O O O O O O O B-fromloc.city_name O B-toloc....
2	2	flight	O O O O O O O O O B-fromloc.city_name O B-tolo...
3	3	flight	O O O O O O O O O B-fromloc.city_name O B-tolo...
4	4	flight	O O O O O O B-fromloc.city_name O B-toloc.city...