Long Short-Term Memory

A recurrent neural network with gates
lstm
gates
Author

Shataxi Dubey

Published

July 15, 2025

Through this notebook, we can understand how the input gate, output gate, and forget gate work in an LSTM, and how forward propagation and backward propagation flow through these gates in a recurrent neural network.
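
For reference, these are the standard LSTM update equations implemented in the forward method below, where \([h_{t-1}, x_t]\) denotes the hidden state concatenated with the current input:

\[
\begin{aligned}
f_t &= \sigma(W_f [h_{t-1}, x_t] + b_f) && \text{forget gate} \\
i_t &= \sigma(W_i [h_{t-1}, x_t] + b_i) && \text{input gate} \\
o_t &= \sigma(W_o [h_{t-1}, x_t] + b_o) && \text{output gate} \\
\tilde{c}_t &= \tanh(W_c [h_{t-1}, x_t] + b_c) && \text{candidate cell state} \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t && \text{cell state} \\
h_t &= o_t \odot \tanh(c_t) && \text{hidden state} \\
y_t &= W_y h_t + b_y && \text{output logits}
\end{aligned}
\]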

import torch
import numpy as np
from tqdm import tqdm
# Block-matrix identity: [h, c] @ [[Wi], [Wf]] == h @ Wi + c @ Wf.
# The same trick lets the LSTM below concatenate the hidden state with the
# input and apply a single weight matrix per gate.
cell_state = torch.arange(start = 0, end = 5).reshape((1,5))
hidden_state = torch.arange(start = 10, end = 15).reshape((1,5))
input_gate = torch.arange(start = 0, end = 10).reshape((5,2))
forget_gate = torch.arange(start = 10, end = 20).reshape((5,2))
concatenated_input = torch.concatenate([hidden_state, cell_state], axis = 1)
concatenated_weights = torch.concatenate([input_gate, forget_gate], axis = 0)
concatenated_input@concatenated_weights
tensor([[420, 490]])
cell_state@forget_gate + hidden_state@input_gate
tensor([[420, 490]])
def oneHotEncode(data, num_chars, chr_to_idx):
    # Convert a string into a list of one-hot column vectors, one per character
    all_one_hots = []
    for ch in data:
        onehot = np.zeros((num_chars, 1))
        idx = chr_to_idx[ch]
        onehot[idx] = 1
        all_one_hots.append(onehot)

    return all_one_hots
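
A quick sanity check of the encoder on a tiny made-up vocabulary (vocab and c2i below are only for illustration):

vocab = ['a', 'b', 'c']
c2i = {c: i for i, c in enumerate(vocab)}
encoded = oneHotEncode('ab', len(vocab), c2i)
print(encoded[0].ravel())  # [1. 0. 0.]
print(encoded[1].ravel())  # [0. 1. 0.]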

Xavier initialisation

Uniform Xavier initialisation

Draw each weight from a uniform distribution over \([-x, x]\), where

\(x = \sqrt{6 / (\text{input\_size} + \text{output\_size})}\)

def initWeights(input_size, output_size):
    # Returns a Xavier-initialised matrix of shape (input_size, output_size).
    # For the gate weights below, input_size is the hidden size and
    # output_size is hidden_size + number of unique characters.
    x = np.sqrt(6 / (input_size + output_size))
    w = np.random.uniform(low = -x, high = x, size = (input_size, output_size)) 
    return w


def sigmoid(input):
    # clip to a symmetric range so np.exp does not overflow for large negative inputs
    input = np.clip(input, -500, 500)
    sig_out = 1 / (1 + np.exp(-input)) 
    return sig_out

def tanh(input):
    return np.tanh(input)

def derivative_sigmoid(input):
    # input here is the sigmoid output s, so the derivative w.r.t. the pre-activation is s * (1 - s)
    return input*(1 - input)

def derivative_tanh(input):
    # input here is the tanh output t, so the derivative w.r.t. the pre-activation is 1 - t^2
    return 1 - (input**2)

def softmax(input):
    # subtract the max before exponentiating for numerical stability
    exp_input = np.exp(input - np.max(input))
    return exp_input / np.sum(exp_input)
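
A small illustrative check that these derivative helpers expect the activation output rather than the pre-activation: passing sigmoid(z) and tanh(z) into them matches a finite-difference estimate of the derivatives at z.

z, h = 0.7, 1e-5
print(derivative_sigmoid(sigmoid(z)), (sigmoid(z + h) - sigmoid(z - h)) / (2 * h))  # both ~0.2217
print(derivative_tanh(tanh(z)), (tanh(z + h) - tanh(z - h)) / (2 * h))              # both ~0.6347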

class LSTM():
    def __init__(self, input_size, hidden_size, output_size, learning_rate, num_epochs):

        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.input_size = input_size
        self.hidden_size = hidden_size

        # forget gate weights (the forget gate output is multiplied elementwise with the cell state at t-1)
        self.wf = initWeights(hidden_size, input_size)
        self.bf = initWeights(hidden_size, 1)

        # input gate weights
        self.wi = initWeights(hidden_size, input_size)
        self.bi = initWeights(hidden_size, 1)

        # output gate weights
        self.wo = initWeights(hidden_size, input_size)
        self.bo = initWeights(hidden_size, 1)

        # candidate gate weights
        self.wc = initWeights(hidden_size, input_size)
        self.bc = initWeights(hidden_size, 1)

        # final layer weights
        self.wy = initWeights(output_size, hidden_size)
        self.by = initWeights(output_size, 1)

    
    def reset(self):
        self.hidden_state = {-1: np.zeros((self.hidden_size, 1))}
        self.cell_state = {-1: np.zeros((self.hidden_size, 1))}
        self.concatenated_input = {}
        self.forget_gate = {}
        self.input_gate = {}
        self.output_gate = {}
        self.intermediate_cell_state = {}

    def forward(self, inputs): 
        self.reset()

        output = []
        for idx in range(len(inputs)):
            self.concatenated_input[idx] = np.concatenate([self.hidden_state[idx - 1], inputs[idx]]) # shape of concatenated input: (hidden_size + char_size, 1)
            self.forget_gate[idx] = sigmoid(self.wf@self.concatenated_input[idx] + self.bf) # shape of forget gate: (hidden_size, 1)
            self.input_gate[idx] = sigmoid(self.wi@self.concatenated_input[idx] + self.bi) # shape of input gate: (hidden_size, 1)
            self.output_gate[idx] = sigmoid(self.wo@self.concatenated_input[idx] + self.bo) # shape of output gate: (hidden_size, 1)
            self.intermediate_cell_state[idx] = tanh(self.wc@self.concatenated_input[idx] + self.bc) # shape of intermediate cell state: (hidden_size, 1)
            
            self.cell_state[idx] = (np.multiply(self.intermediate_cell_state[idx], self.input_gate[idx]) + np.multiply(self.cell_state[idx - 1], self.forget_gate[idx])) # shape of cell state: (hidden size, 1)
            self.hidden_state[idx] = np.multiply(tanh(self.cell_state[idx]), self.output_gate[idx]) # shape of hidden state: (hidden_size, 1)
            
            output += [self.wy @ self.hidden_state[idx] + self.by]  # The final output is computed from the hidden state
            # shape of output: (char_size, 1)

        return output
    
    def train(self, inputs, labels, chr_to_idx):

        for _ in tqdm(range(self.num_epochs)):
            outputs = self.forward(inputs)

            errors = []
            for idx in range(len(outputs)):
                errors += [softmax(outputs[idx])] 
                errors[-1][chr_to_idx[labels[idx]]] -= 1 # This computes y_hat - y, the gradient of the cross-entropy loss w.r.t. z, where y_hat = softmax(z) and y is the one-hot encoding of the true next character (subtract 1 only at its index)

            
            self.backward(errors, outputs)
    

    def backward(self, errors, outputs):

        d_wy, d_by = 0, 0
        d_wo, d_bo = 0, 0
        d_wc, d_bc = 0, 0
        d_wi, d_bi = 0, 0
        d_wf, d_bf = 0, 0

        dh_next, dc_next = np.zeros((self.hidden_size, 1)), np.zeros((self.hidden_size, 1)) 

        for idx in range(len(outputs) - 1, -1, -1): # iterate backwards through time, including timestep 0
            # gradient wrt final gate
            d_wy += errors[idx] @ self.hidden_state[idx].T     # shape of error: (char_size, 1), shape of hidden_state: (hidden_size, 1), shape of wy: (char_size, hidden_size)
            d_by += errors[idx]  # shape of d_by: (char_size, 1)

            # gradient wrt hidden state
            d_h = self.wy.T @ errors[idx] + dh_next  # shape of error: (char_size, 1), shape of wy (char_size, hidden_size) shape of d_h: (hidden_size, 1)


            # gradient wrt output gate
            d_o = tanh(self.cell_state[idx]) * derivative_sigmoid(self.output_gate[idx]) * d_h # shape of cell state: (hidden_size, 1), shape of output gate: (hidden_size, 1) shape of d_o: (hidden_size, 1)
            d_wo += d_o @ self.concatenated_input[idx].T # shape of concatenated_input: (char_size + hidden_size, 1), shape of d_o: (hidden_size, 1), shape of w_o: (hidden_size, char_size + hidden_size), shape of d_wo: (hidden_size, char_size + hidden_size)
            d_bo += d_o # shape of d_bo: (hidden_size, 1)

            # gradient wrt cell state
            d_cs = self.output_gate[idx] * derivative_tanh(tanh(self.cell_state[idx])) * d_h + dc_next # shape of output gate: (hidden_size, 1), shape of cell_state: (hidden_size, 1), shape of d_h: (hidden_size, 1), shape of d_cs: (hidden_size, 1)

            # gradient wrt candidate gate
            d_candidate = self.input_gate[idx] * derivative_tanh(self.intermediate_cell_state[idx]) * d_cs # shape of input gate: (hidden_size, 1), shape of intermediate_cell_state: (hidden_size, 1), shape of d_cs: (hidden_size, 1), shape of d_candidate: (hidden_size, 1)
            d_wc += d_candidate @ self.concatenated_input[idx].T # shape of concatenated_input: (char_size + hidden_size, 1), shape of d_candidate: (hidden_size, 1), shape of d_wc: (hidden_size, hidden_size + char_size)
            d_bc += d_candidate # shape of d_candidate: (hidden_size, 1), shape of d_bc: (hidden_size, 1)

            # gradient wrt input gate
            d_i = self.intermediate_cell_state[idx] * derivative_sigmoid(self.input_gate[idx]) * d_cs # shape of intermediate_cell_state: (hidden_size, 1), shape of input_gate: (hidden_size, 1), shape of d_cs: (hidden_size, 1), shape of d_i: (hidden_size, 1)
            d_wi += d_i @ self.concatenated_input[idx].T # shape of concatenated_input: (hidden_size+ chr_size, 1), shape of d_i: (hidden_size, 1), shape of d_wi; (hidden_size, hidden_size+ chr_size)
            d_bi += d_i # shape of d_i: (hidden_size, 1), shape of d_bi: (hidden_size, 1)

            # gradient wrt forget gate
            d_f = self.cell_state[idx-1] * derivative_sigmoid(self.forget_gate[idx]) * d_cs # shape of cell state: (hidden_size, 1), shape of forget gate: (hidden_size, 1), shape of d_cs: (hidden_size, 1)
            d_wf += d_f @ self.concatenated_input[idx].T # shape of concatenated_input: (hidden_size+ chr_size, 1), shape of d_f: (hidden_size, 1), shape of d_wf: (hidden_size, hidden_size+ chr_size)
            d_bf += d_f # shape of d_f: (hidden_size, 1), shape of d_bf: (hidden_size, 1)

            # gradient wrt concatenated input error
            d_z = self.wf.T @ d_f + self.wi.T @ d_i + self.wo.T @ d_o + self.wc.T @ d_candidate # shape of d_z: (hidden_size + chr_size, 1); the candidate term uses d_candidate, the gradient w.r.t. the candidate pre-activation
            
            # gradient w.r.t. the hidden state of the previous timestep (consumed in the next loop iteration)
            dh_next = d_z[: self.hidden_size, :]
            # gradient w.r.t. the cell state of the previous timestep
            dc_next = self.forget_gate[idx]*d_cs
            

        # clip the accumulated gradients to [-1, 1] to limit exploding gradients
        d_wy, d_by, d_wf, d_bf, d_wc, d_bc, d_wi, d_bi, d_wo, d_bo = (
            np.clip(grad, -1, 1) for grad in (d_wy, d_by, d_wf, d_bf, d_wc, d_bc, d_wi, d_bi, d_wo, d_bo))

        self.wf = self.wf - self.learning_rate * d_wf
        self.bf = self.bf - self.learning_rate * d_bf

        self.wi = self.wi - self.learning_rate * d_wi
        self.bi = self.bi - self.learning_rate * d_bi

        self.wo = self.wo - self.learning_rate * d_wo
        self.bo = self.bo - self.learning_rate * d_bo

        self.wc = self.wc - self.learning_rate * d_wc
        self.bc = self.bc - self.learning_rate * d_bc

        self.wy = self.wy - self.learning_rate * d_wy
        self.by = self.by - self.learning_rate * d_by


    
    def inference(self, input, idx_to_chr):
        outputs = self.forward(input)
        characters = ''
        for output in outputs:
            probs = softmax(output)
            idx = np.argmax(probs)
            characters += (idx_to_chr[idx])

        return characters
##### Data #####
data = """To be, or not to be, that is the question: Whether \
'tis nobler in the mind to suffer The slings and arrows of ou\
trageous fortune, Or to take arms against a sea of troubles A\
nd by opposing end them. To die—to sleep, No more; and by a s\
leep to say we end The heart-ache and the thousand natural sh\
ocks That flesh is heir to: 'tis a consummation Devoutly to b\
e wish'd. To die, to sleep; To sleep, perchance to dream—ay, \
there's the rub: For in that sleep of death what dreams may c\
ome, When we have shuffled off this mortal coil, Must give us\
 pause—there's the respect That makes calamity of so long lif\
e. For who would bear the whips and scorns of time, Th'oppres\
sor's wrong, the proud man's contumely, The pangs of dispriz'\
d love, the law's delay, The insolence of office, and the spu\
rns That patient merit of th'unworthy takes, When he himself \
might his quietus make""".lower()


chars = set(data)
num_chars = len(chars)

chr_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_chr = {i: c for i, c in enumerate(chars)}

trainX, trainY = data[:-1], data[1:]

char_size = len(chars)
print(f'Unique characters {char_size}, total training inputs {len(trainX)}')
hidden_size = 10

learning_rate, num_epochs = 0.01, 1000

trainX = oneHotEncode(trainX, num_chars, chr_to_idx)

lstm = LSTM(char_size + hidden_size, hidden_size, char_size, learning_rate, num_epochs)
Unique characters 32, total training inputs 865
outputs = lstm.forward(trainX)
len(outputs[0])
32
len(outputs)
865
lstm.train(trainX, trainY, chr_to_idx)
  3%|▎         | 28/1000 [00:04<02:23,  6.78it/s]
RuntimeWarning: overflow encountered in matmul / add (in the d_z computation in backward)
RuntimeWarning: invalid value encountered in multiply / matmul (in the gate gradient computations in backward)
100%|██████████| 1000/1000 [02:25<00:00,  6.88it/s]
output = lstm.inference(trainX, idx_to_chr)
print(output)
len(output)
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
865
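
The generated text collapses to a single repeated character. This is likely because the gradients overflowed to NaN during training (see the RuntimeWarnings above), so the weights, and therefore the output logits, end up containing NaNs; np.argmax over an all-NaN vector returns the same index at every timestep, which maps to the same character.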

What is the error in the case of the LSTM?

Ans: each output is a vector of size char_size.

Applying softmax to the output vector gives a vector of probabilities over the characters.

We calculate the cross-entropy loss \(L(\theta) = -\sum_i y_i \log \hat{y}_i\) (note that it includes the leading minus sign).

If the input x at timestep t belongs to class \(l\), then \(y_l = 1\) and the cross-entropy loss reduces to \(L(\theta) = -\log(\hat{y}_l)\)

\(\frac{dL(\theta)}{d\hat{y}_l} = \frac{-1}{\hat{y}_l}\)

\(\hat{y} = \mathrm{softmax}(z)\)

\(\hat{y} = (\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_L)\) where \(L\) is the number of unique characters

\(\frac{\partial \hat{y}_l}{\partial z_i} = 1_{i=l}\,\mathrm{softmax}(z_l) - \mathrm{softmax}(z_l)\,\mathrm{softmax}(z_i) = 1_{i=l}\,\hat{y}_l - \hat{y}_l\hat{y}_i = \hat{y}_l(1_{i=l} - \hat{y}_i)\)

\(\frac{dL(\theta)}{dz_i} = \frac{-1}{\hat{y}_l}\,\hat{y}_l(1_{i=l} - \hat{y}_i) = \hat{y}_i - 1_{i=l}\) This vector of size L is exactly \(\hat{y} - y\): subtract 1 only at the index of the true character, which is what the train method computes.
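
As an illustrative sanity check (not part of the training code above), we can confirm numerically that the gradient of the cross-entropy loss with respect to the logits is \(\hat{y} - y\), reusing the softmax function defined earlier and comparing against a finite-difference estimate:

z = np.random.randn(5, 1)               # logits for a toy 5-character vocabulary
true_idx = 2                            # index of the true character
y = np.zeros((5, 1)); y[true_idx] = 1   # one-hot target

analytic = softmax(z) - y               # claimed gradient: y_hat - y

# finite-difference estimate of dL/dz, with L = -log(softmax(z)[true_idx])
numeric = np.zeros_like(z)
eps = 1e-6
for i in range(len(z)):
    z_plus, z_minus = z.copy(), z.copy()
    z_plus[i] += eps
    z_minus[i] -= eps
    loss_plus = -np.log(softmax(z_plus)[true_idx, 0])
    loss_minus = -np.log(softmax(z_minus)[true_idx, 0])
    numeric[i] = (loss_plus - loss_minus) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-4))  # True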

References:

  • https://github.com/CallMeTwitch/Neural-Network-Zoo/blob/main/LongShortTermMemoryNetwork.py