Fine-Tuning a BERT Model – MachineLearningMastery.com

Posted in

[ad_1]

import collections

import dataclasses

import functools

 

import torch

import torch.nn as nn

import torch.optim as optim

import tqdm

from datasets import load_dataset

from tokenizers import Tokenizer

from torch import Tensor

 

 

# BERT config and model defined previously

@dataclasses.dataclass
class BertConfig:
    """Configuration for BERT model.

    Attributes:
        vocab_size: Number of rows in the word-embedding table.
        num_layers: Number of transformer blocks stacked in the backbone.
        hidden_size: Width of every embedding and hidden state.
        num_heads: Number of attention heads in each block.
        dropout_prob: Dropout probability used in embeddings and blocks.
        pad_id: Token id used as ``padding_idx`` of the word embedding.
        max_seq_len: Number of rows in the position-embedding table.
        num_types: Number of rows in the token-type (segment) embedding table.
    """
    vocab_size: int = 30522
    num_layers: int = 12
    hidden_size: int = 768
    num_heads: int = 12
    dropout_prob: float = 0.1
    pad_id: int = 0
    max_seq_len: int = 512
    num_types: int = 2

 

class BertBlock(nn.Module):
    """One transformer block in BERT.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network with GELU activation. Each sub-layer has a residual
    connection and is normalized *after* the addition (post-norm).
    """

    def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):
        """Build the attention, normalization and feed-forward sub-layers.

        Args:
            hidden_size: Width of the input and output hidden states.
            num_heads: Number of attention heads.
            dropout_prob: Dropout probability, used inside the attention
                module and on the feed-forward output.
        """
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads,
                                               dropout=dropout_prob, batch_first=True)
        self.attn_norm = nn.LayerNorm(hidden_size)
        self.ff_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x: Tensor, pad_mask: Tensor) -> Tensor:
        """Run the block on a batch of sequences.

        Args:
            x: Hidden states, batch-first (``batch_first=True`` is set on the
                attention module).
            pad_mask: Mask passed as ``key_padding_mask``; positions marked
                True are ignored as attention keys.

        Returns:
            The transformed hidden states, same shape as ``x``.
        """
        # self-attention with padding mask and post-norm
        attn_output, _ = self.attention(x, x, x, key_padding_mask=pad_mask)
        x = self.attn_norm(x + attn_output)
        # feed-forward with GeLU activation and post-norm
        ff_output = self.feed_forward(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x

 

class BertPooler(nn.Module):
    """Pooler layer for BERT to process the [CLS] token output.

    Applies a square linear projection followed by ``tanh``.
    """

    def __init__(self, hidden_size: int):
        """Create the dense projection (hidden_size -> hidden_size) and Tanh."""
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, x: Tensor) -> Tensor:
        """Return ``tanh(dense(x))``; the last dimension of ``x`` is hidden_size."""
        x = self.dense(x)
        x = self.activation(x)
        return x

 

class BertModel(nn.Module):
    """Backbone of BERT model.

    Sums word, token-type and position embeddings, normalizes them, runs the
    result through a stack of ``BertBlock`` layers and pools the hidden state
    of the first token with ``BertPooler``.
    """

    def __init__(self, config: BertConfig):
        """Build embeddings, transformer blocks and pooler from ``config``."""
        super().__init__()
        # embedding layers
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size,
                                            padding_idx=config.pad_id)
        self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)
        self.embeddings_norm = nn.LayerNorm(config.hidden_size)
        self.embeddings_dropout = nn.Dropout(config.dropout_prob)
        # transformer blocks
        self.blocks = nn.ModuleList([
            BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)
            for _ in range(config.num_layers)
        ])
        # [CLS] pooler layer
        self.pooler = BertPooler(config.hidden_size)

    def forward(self, input_ids: Tensor, token_type_ids: Tensor, pad_id: int = 0,
                ) -> tuple[Tensor, Tensor]:
        """Encode a batch of token sequences.

        Args:
            input_ids: Integer token ids of shape (batch_size, seq_len).
            token_type_ids: Segment ids, same shape as ``input_ids``.
            pad_id: Token id treated as padding when building the attention
                mask.

        Returns:
            A tuple ``(sequence_output, pooled_output)``: the hidden state of
            every position, and the pooled hidden state of position 0.
        """
        # create attention mask for padding tokens
        pad_mask = input_ids == pad_id
        # convert integer tokens to embedding vectors
        _, seq_len = input_ids.shape
        # position ids are 0..seq_len-1, broadcast across the batch
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)
        type_embeddings = self.type_embeddings(token_type_ids)
        token_embeddings = self.word_embeddings(input_ids)
        x = token_embeddings + type_embeddings + position_embeddings
        x = self.embeddings_norm(x)
        x = self.embeddings_dropout(x)
        # process the sequence with transformer blocks
        for block in self.blocks:
            x = block(x, pad_mask)
        # pool the hidden state of the `[CLS]` token
        pooled_output = self.pooler(x[:, 0, :])
        return x, pooled_output

 

# Define new BERT model for question answering

class BertForQuestionAnswering(nn.Module):
    """BERT model for SQuAD question answering.

    Wraps a ``BertModel`` backbone and adds one linear layer that maps every
    position's hidden state to two scores: a start logit and an end logit.
    """

    def __init__(self, config: BertConfig):
        """Create the backbone and the two-output span head from ``config``."""
        super().__init__()
        self.bert = BertModel(config)
        # Two outputs: start and end position logits
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

    def forward(self,
        input_ids: Tensor,
        token_type_ids: Tensor,
        pad_id: int = 0,
    ) -> tuple[Tensor, Tensor]:
        """Compute start and end logits for every token position.

        Args:
            input_ids: Integer token ids of shape (batch_size, seq_len).
            token_type_ids: Segment ids, same shape as ``input_ids``.
            pad_id: Token id treated as padding by the backbone.

        Returns:
            ``(start_logits, end_logits)``, each of shape (batch_size, seq_len).
        """
        # Get sequence output from BERT (batch_size, seq_len, hidden_size);
        # the pooled [CLS] output is not needed for span prediction
        seq_output, _ = self.bert(input_ids, token_type_ids, pad_id=pad_id)
        # Project to start and end logits
        logits = self.qa_outputs(seq_output)  # (batch_size, seq_len, 2)
        start_logits = logits[:, :, 0]  # (batch_size, seq_len)
        end_logits = logits[:, :, 1]    # (batch_size, seq_len)
        return start_logits, end_logits

 

# Load SQuAD dataset for question answering
dataset = load_dataset("squad")

# Load the pretrained BERT tokenizer
TOKENIZER_PATH = "wikitext-2_wordpiece.json"
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

 

# Setup collate function to tokenize question-context pairs for the model

def collate(batch: list[dict], tokenizer: Tokenizer, max_len: int,
            ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
    """Collate question-context pairs for the model.

    Every item is encoded as ``[CLS] question [SEP] context [SEP]`` and then
    truncated or padded to exactly ``max_len`` tokens. The answer span is
    located by searching for the tokenized first answer text inside the
    tokenized context.

    Args:
        batch: Items with keys ``"question"``, ``"context"`` and
            ``"answers"`` (a dict whose ``"text"`` entry is a list of answer
            strings; only the first one is used).
        tokenizer: Object providing ``token_to_id(str)`` and
            ``encode(str).ids``.
        max_len: Fixed sequence length of the output tensors.

    Returns:
        ``(input_ids, token_type_ids, start_positions, end_positions)``. The
        first two have shape (len(batch), max_len); the last two have shape
        (len(batch),). A start/end of 0 (the ``[CLS]`` position) means no
        answer was found, or the answer did not fit into ``max_len``.
    """
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    pad_id = tokenizer.token_to_id("[PAD]")

    input_ids_list = []
    token_type_ids_list = []
    start_positions = []
    end_positions = []

    for item in batch:
        # Tokenize question and context
        question, context = item["question"], item["context"]
        question_ids = tokenizer.encode(question).ids
        context_ids = tokenizer.encode(context).ids

        # Build input: [CLS] question [SEP] context [SEP]
        input_ids = [cls_id, *question_ids, sep_id, *context_ids, sep_id]
        token_type_ids = [0] * (len(question_ids) + 2) + [1] * (len(context_ids) + 1)

        # Truncate or pad to max length
        if len(input_ids) > max_len:
            input_ids = input_ids[:max_len]
            token_type_ids = token_type_ids[:max_len]
        else:
            input_ids.extend([pad_id] * (max_len - len(input_ids)))
            # padding positions keep segment id 1, like the context they follow
            token_type_ids.extend([1] * (max_len - len(token_type_ids)))

        # Find answer position in tokens: Answer may not be in the context
        start_pos = end_pos = 0
        if len(item["answers"]["text"]) > 0:
            answers = tokenizer.encode(item["answers"]["text"][0]).ids
            # An answer with no tokens would match everywhere and give
            # end_pos < start_pos, so treat it as "no answer".
            if answers:
                # find the context offset of the answer in context_ids
                for i in range(len(context_ids) - len(answers) + 1):
                    if context_ids[i:i + len(answers)] == answers:
                        # shift by [CLS] + question + [SEP] to index into input_ids
                        start_pos = i + len(question_ids) + 2
                        end_pos = start_pos + len(answers) - 1
                        break
            if end_pos >= max_len:
                start_pos = end_pos = 0  # answer is clipped, hence no answer

        input_ids_list.append(input_ids)
        token_type_ids_list.append(token_type_ids)
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    input_ids_list = torch.tensor(input_ids_list)
    token_type_ids_list = torch.tensor(token_type_ids_list)
    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)
    return (input_ids_list, token_type_ids_list, start_positions, end_positions)

 

batch_size = 16
max_len = 384  # Longer for Q&A to accommodate context
# Bind tokenizer and max_len so the DataLoader only has to pass the batch
collate_fn = functools.partial(collate, tokenizer=tokenizer, max_len=max_len)
train_loader = torch.utils.data.DataLoader(dataset["train"], batch_size=batch_size,
                                           shuffle=True, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(dataset["validation"], batch_size=batch_size,
                                         shuffle=False, collate_fn=collate_fn)

 

# Create Q&A model with a pretrained foundation BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = BertConfig()
model = BertForQuestionAnswering(config)
model.to(device)
# Only the backbone is restored from the checkpoint; the qa_outputs head
# keeps its fresh initialization and is learned during fine-tuning.
model.bert.load_state_dict(torch.load("bert_model.pth", map_location=device))

 

# Training setup
# Start and end logits are scored as a classification over token positions.
loss_fn = nn.CrossEntropyLoss()
# 2e-5 is a typical BERT fine-tuning learning rate (the exponent sign had been lost)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    # Training
    with tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
        for batch in pbar:
            # get batched data
            input_ids, token_type_ids, start_positions, end_positions = batch
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            start_positions = start_positions.to(device)
            end_positions = end_positions.to(device)
            # forward pass
            start_logits, end_logits = model(input_ids, token_type_ids)
            # backward pass
            optimizer.zero_grad()
            start_loss = loss_fn(start_logits, start_positions)
            end_loss = loss_fn(end_logits, end_positions)
            loss = start_loss + end_loss
            loss.backward()
            optimizer.step()
            # update progress bar; iterating over pbar already advances the
            # counter, so no manual pbar.update(1) (it would double-count)
            pbar.set_postfix(loss=float(loss))

    # Validation: Keep track of the average loss and accuracy
    model.eval()
    val_loss, num_matches, num_batches, num_samples = 0, 0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            # get batched data
            input_ids, token_type_ids, start_positions, end_positions = batch
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            start_positions = start_positions.to(device)
            end_positions = end_positions.to(device)
            # forward pass on validation data
            start_logits, end_logits = model(input_ids, token_type_ids)
            # compute loss
            start_loss = loss_fn(start_logits, start_positions)
            end_loss = loss_fn(end_logits, end_positions)
            loss = start_loss + end_loss
            val_loss += loss.item()
            num_batches += 1
            # compute accuracy: a sample counts only if both start and end match
            pred_start = start_logits.argmax(dim=1)
            pred_end = end_logits.argmax(dim=1)
            match = (pred_start == start_positions) & (pred_end == end_positions)
            num_matches += match.sum().item()
            num_samples += len(start_positions)

    avg_loss = val_loss / num_batches
    acc = num_matches / num_samples
    print(f"Validation {epoch+1}/{num_epochs}: acc {acc:.4f}, avg loss {avg_loss:.4f}")

# Save the fine-tuned model
torch.save(model.state_dict(), "bert_model_squad.pth")

[ad_2]

Source link


Share Your Thoughts

Post your comment(s), question(s) or thought(s) below.

Tip: See a question you can help with? Feel free to share your knowledge! Learning is better when we help each other out. Your insights could be exactly what a fellow learner needs.

Leave a Reply

  1. Comments

    hello world

    hello world


  2. Comments

    synthroid levothyroxine

    synthroid levothyroxine


  3. Comments

    mebeverine 200mg

    mebeverine 200mg


  4. Comments

    cefixime trihydrate

    cefixime trihydrate


  5. Comments

    lasix water pill

    lasix water pill


  6. Comments

    azithromycin 500 mg dosage

    azithromycin 500 mg dosage


  7. Comments

    doxycycline hyclate 100 mg capsule

    doxycycline hyclate 100 mg capsule


  8. Comments

    levitra generic online

    levitra generic online


  9. Comments

    fluconazole

    fluconazole


  10. Comments

    meloxicam for pain in dogs

    meloxicam for pain in dogs


  11. Comments

    fluconazole for yeast infection over the counter

    fluconazole for yeast infection over the counter


  12. Comments

    tetracycline hydrochloride ointment

    tetracycline hydrochloride ointment


  13. Comments

    buy antibiotics online safely

    buy antibiotics online safely


  14. Comments

    prescription antibiotics

    prescription antibiotics


  15. Comments

    prevacid tablets over the counter

    prevacid tablets over the counter


  16. Comments

    sertraline generic

    sertraline generic


  17. Comments

    amoxicillin for uti

    amoxicillin for uti


  18. Comments

    sertraline hcl

    sertraline hcl


  19. Comments

    medication metoprolol

    medication metoprolol


  20. Comments

    stendra medication

    stendra medication


  21. Comments

    rifampin medication

    rifampin medication


  22. Comments

    dexlansoprazole cost without insurance

    dexlansoprazole cost without insurance


  23. Comments

    doryx 50mg capsule

    doryx 50mg capsule


  24. Comments

    cialis tablet doses

    cialis tablet doses


  25. Comments

    cialis 5mg walmart

    cialis 5mg walmart


  26. Comments

    cenforce 200 reviews

    cenforce 200 reviews


  27. Comments

    cenforce 500

    cenforce 500


  28. Comments

    tadalafil generic brand

    tadalafil generic brand


  29. Comments

    what is udenafil

    what is udenafil


  30. Comments

    dutasteride reddit mtf

    dutasteride reddit mtf


  31. Comments

    finasteride dosage

    finasteride dosage


  32. Comments

    sémaglutide maroc comprimé

    sémaglutide maroc comprimé


  33. Comments

    semaglutida precio similares pluma

    semaglutida precio similares pluma


  34. Comments

    tirzepatide vs semaglutide cost reddit

    tirzepatide vs semaglutide cost reddit


  35. Comments

    fincar 5mg tablet

    fincar 5mg tablet


  36. Comments

    orlistat tablets australia

    orlistat tablets australia


  37. Comments

    semaglutide in voeding

    semaglutide in voeding


  38. Comments

    semaglutid tablett viktnedgång

    semaglutid tablett viktnedgång


  39. Comments

    semaglutid spritze hersteller

    semaglutid spritze hersteller


  40. Comments

    sildenafil tablet amazon

    sildenafil tablet amazon


  41. Comments

    buy vardenafil nz

    buy vardenafil nz


  42. Comments

    ozempic coupon card

    ozempic coupon card


  43. Comments

    does doxycycline affect sleep

    does doxycycline affect sleep