import React from 'react';
import '../../styles/subsection.css';
import Header from '../../components/Header';
import Footer from '../../components/Footer';
import { Link } from 'react-router-dom';
import 'katex/dist/katex.min.css';
import { InlineMath, BlockMath } from 'react-katex';
import { LightAsync as SyntaxHighlighter } from 'react-syntax-highlighter';
import { docco } from 'react-syntax-highlighter/dist/esm/styles/hljs';
import rnn from '../../media/RNNs/rnn_towards_ai.png';
import lstm from '../../media/RNNs/lstm.png';

function RNN() {
    return (
        <div className="subsubsection-container">
            <Header />
            <div className="side-nav-container">
                <aside className="subsubsection-side-nav">
                    <a href="#rnn">RNNs</a>
                    <a href="#lstm">LSTM</a>
                    <a href="#gru">Gated Recurrent Units</a>
                    <a href="#bi">Bi-Directional RNNs</a>
                </aside>
            </div>
            
            <main className="subsubsection-content">
                <div className="titles"><h1>Recurrent Neural Networks</h1></div>

                <section id="rnn" className="code-cleaned">
                <h2>Introduction to RNNs</h2>
                <p className="subsubsection-paragraph">
                    Recurrent Neural Networks (RNNs) are a class of artificial neural networks designed for handling sequential data. Unlike traditional feedforward neural networks, RNNs 
                    possess a unique feature: they have internal loops allowing information to persist, making them ideal for tasks where context and order matter such as NLP tasks.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Language modeling</span>,
                                    <span style={{ color: '#008000' }}> Text generation</span>,
                                    <span style={{ color: '#ff4500' }}> Sentiment analysis</span>,
                                    <span style={{ color: '#1e90ff' }}> Sequence labeling (e.g., part-of-speech tagging)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.SimpleRNN)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (torch.nn.RNN)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Typically <span>O(t*n^2)</span>, where <i>t</i> is the length of the input sequence and <i>n</i> is the number of hidden units
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Learning representations by back-propagating errors"</span> by Rumelhart, Hinton, Williams, 1986; foundational for understanding the backpropagation training technique in neural networks, including RNNs
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>RNN Foundations</h4>
                <p className="subsubsection-paragraph">
                    The essence of RNNs lies in their unique ability to process sequences by leveraging a form of memory. Unlike traditional neural networks, 
                    where inputs are treated independently, RNNs use their internal state (memory) to process sequences of inputs. This is achieved through a mechanism of recurrence, where 
                    the network's hidden state at any time step <InlineMath math="t" /> is influenced not just by the current input <InlineMath math="x_t" />, but also by the previous hidden
                     state <InlineMath math="h_{t-1}" />. The recurrent nature of RNNs is mathematically expressed as:
                    <BlockMath math="h_t = \sigma(W_{hh}h_{t-1} + W_{xh}x_t + b)" />
                    In this equation, <InlineMath math="h_t" /> is the hidden state at time <InlineMath math="t" />, <InlineMath math="\sigma" /> denotes a nonlinear activation function such as
                     tanh or ReLU, which introduces the necessary nonlinearity to the model allowing it to learn complex patterns. <InlineMath math="W_{hh}" /> and <InlineMath math="W_{xh}" /> are
                      the weight matrices for the hidden-to-hidden and input-to-hidden connections, respectively, and <InlineMath math="b" /> represents the bias. The recurrence formula allows the
                       RNN to effectively 'remember' information across time steps, making it particularly suited for tasks where context and temporal order are important.
                </p>

                <p className="subsubsection-paragraph">
                The operation of a Recurrent Neural Network at each time step can be broken down into two primary activities: updating the hidden state and
                      producing an output. At each 
                    time step <InlineMath math="t" />, the RNN takes in the current input <InlineMath math="x_t" /> and the previous hidden state <InlineMath math="h_{t-1}" />, combining them 
                    to update the current hidden state <InlineMath math="h_t" />. This update is governed by the network's weights and the chosen activation function, as described by the 
                    previous equation. The updated hidden state <InlineMath math="h_t" /> encapsulates the information from the current 
                    input and all preceding inputs in the sequence, serving as a compact summary of the sequence information up to time <InlineMath math="t" />. Depending on the specific 
                    architecture and task, the RNN can then produce an output <InlineMath math="y_t" /> at each time step, which might be based on the current hidden 
                    state <InlineMath math="h_t" />, or only produce an output at the final time step, reflecting the aggregated information from the entire sequence.
                    
                    <figure className="flex-container-caption">
                        <div className="flex-container"><img src={rnn} alt="Broken" className="image-medium"/></div>
                        <figcaption>The typical RNN architecture; wrapped and unwrapped. The hidden state is updated as we move along the continuum until the final element of the sequence is processed; <a href="https://pub.towardsai.net/whirlwind-tour-of-rnns-a11effb7808f" target="_blank" rel="noopener noreferrer">image source</a>.</figcaption>
                        </figure>
                    </p>



                    <p className="subsubsection-paragraph">
                    
                    Consider an RNN designed for a simple NLP task: processing the word "cat" one letter at a time. To keep the arithmetic small, assume the entire alphabet consists of just 3 letters ("c", "a", and "t"). This example details each step the RNN
                     takes to update its hidden state upon 
                    receiving each character in the sequence.
                    <ol className="step-list">
                    <li>
                        <strong>Initialization:</strong> The initial hidden state <InlineMath math="h_0" /> is set to <InlineMath math="[0, 0, 0]" />, assuming the RNN has 3 neurons in its 
                        hidden layer; the hidden-to-hidden weight matrix is therefore 3 x 3. This zero initialization represents the lack of prior information before processing the sequence.
                    </li>
                    <br />
                    <li>
                        <strong>Weight Matrices and Biases:</strong> Let's define the weight matrices and biases for the RNN:
                        
                        <ul>
                        <br />
                            <li><div className="custom-math-size"><InlineMath math="W_{xh} = \begin{bmatrix} 0.1 & 0.2 & 0.3 \\ 0.4 & 0.5 & 0.6 \\ 0.7 & 0.8 & 0.9 \end{bmatrix}" /></div><br /> the input-to-hidden weight matrix.</li><br />
                            <li><div className="custom-math-size"><InlineMath math="W_{hh} = \begin{bmatrix} 0.2 & 0.2 & 0.2 \\ 0.3 & 0.3 & 0.3 \\ 0.4 & 0.4 & 0.4 \end{bmatrix}" /></div><br /> the hidden-to-hidden weight matrix for the layer.</li><br />
                            <li><div className="custom-math-size"><InlineMath math="b_h = [0.01, 0.01, 0.01]" /></div><br /> the bias vector for the hidden layer.</li><br />
                        </ul>
                    </li>
                    <br />
                    <li>
                        <strong>Encoding "c":</strong> The character 'c' generally would be encoded as a one-hot vector <InlineMath math="x_1 = [0, 0, 1, 0, \ldots, 0]" />, where the '1' indicates the position of
                         'c' in the alphabet. In our case, we're keeping it simple so we'll position it as [0, 0, 1].
                    </li>
                    <br />
                    <li>
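                        <strong>Calculating <InlineMath math="h_1" />:</strong> The RNN updates its hidden state <InlineMath math="h_1" /> based on <InlineMath math="x_1" />, <InlineMath math="h_0" />, and the network parameters (to keep the arithmetic easy to follow, the activation function <InlineMath math="\sigma" /> is omitted in this example):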
                        <BlockMath math="h_1 = W_{xh} \cdot x_1 + W_{hh} \cdot h_0 + b_h" />
                        Substituting the values, the calculation for <InlineMath math="h_1" /> becomes:
                        <div className="custom-math-size"><BlockMath math="h_1 = \begin{bmatrix} 0.1 & 0.2 & 0.3 \\ 0.4 & 0.5 & 0.6 \\ 0.7 & 0.8 & 0.9 \end{bmatrix} \cdot \begin{bmatrix} 0 \\ 0 \\ 1 \end{bmatrix} + \begin{bmatrix} 0.2 & 0.2 & 0.2 \\ 0.3 & 0.3 & 0.3 \\ 0.4 & 0.4 & 0.4 \end{bmatrix} \cdot \begin{bmatrix} 0 \\ 0 \\ 0 \end{bmatrix} + \begin{bmatrix} 0.01 \\ 0.01 \\ 0.01 \end{bmatrix} = \begin{bmatrix} 0.31 \\ 0.61 \\ 0.91 \end{bmatrix}" /></div>
                    </li>
                    <br />
                    <li>
                        <strong>Encoding "a":</strong> The character 'a' is encoded as <InlineMath math="x_2 = [1, 0, 0]" />, where the '1' represents 'a'.
                    </li>
                    <br />
                    <li>
                        <strong>Calculating <InlineMath math="h_2" />:</strong> The new hidden state <InlineMath math="h_2" /> is computed using <InlineMath math="x_2" /> and <InlineMath math="h_1" />:
                        <BlockMath math="h_2 = W_{xh} \cdot x_2 + W_{hh} \cdot h_1 + b_h" />
                        Substituting the values for <InlineMath math="x_2" /> and <InlineMath math="h_1" />, the computation yields:
                        <div className="custom-math-size"><BlockMath math="h_2 = \begin{bmatrix} 0.1 & 0.2 & 0.3 \\ 0.4 & 0.5 & 0.6 \\ 0.7 & 0.8 & 0.9 \end{bmatrix} \cdot \begin{bmatrix} 1 \\ 0 \\ 0 \end{bmatrix} + \begin{bmatrix} 0.2 & 0.2 & 0.2 \\ 0.3 & 0.3 & 0.3 \\ 0.4 & 0.4 & 0.4 \end{bmatrix} \cdot \begin{bmatrix} 0.31 \\ 0.61 \\ 0.91 \end{bmatrix} + \begin{bmatrix} 0.01 \\ 0.01 \\ 0.01 \end{bmatrix} = \begin{bmatrix} 0.476 \\ 0.959 \\ 1.442 \end{bmatrix}" /></div>
                    </li>
                </ol>
                </p>

                <p className="subsubsection-paragraph">
                    Through these steps, the RNN processes the characters of "cat" one at a time (the final character "t" is handled in exactly the same way, with <InlineMath math="h_2" /> as the previous hidden state), updating its hidden state to integrate the information conveyed by each character sequentially. 
                    This detailed example demonstrates how specific weight matrices and biases influence the RNN's computations at each step.
                </p>


                <h4>RNN Architectures</h4>
                {/* <p className="subsubsection-paragraph">
                    The architecture of a Recurrent Neural Network is designed to recognize patterns in sequences of data. The network comprises three main layers: the input layer, the 
                    hidden (recurrent) layer, and the output layer. The input layer receives the data one time step at a time, passing it to the hidden layer. The hidden layer is the core of 
                    the RNN, where the recurrence mechanism takes place. At each time step, the hidden layer updates its state based on both the current input from the input layer and its own 
                    previous state, a process described by the previously seen recurrence relation <BlockMath math="h_t = \sigma(W_{hh}h_{t-1} + W_{xh}x_t + b)" /> This allows the network to maintain
                     a 'memory' 
                    of all previous inputs in its hidden state, enabling it to exhibit dynamic temporal behavior. The output layer then uses the state of the hidden layer to generate the output for
                     the current time step, which can be a prediction, a classification, or any other form of output relevant to the task at hand. The unique aspect of the RNN's architecture is its 
                     use of shared weights (<InlineMath math="W_{hh}" /> and <InlineMath math="W_{xh}" />) across all time steps, which not only reduces the model complexity but also enables it to 
                     process input sequences of any length.
                </p> */}

                <p className="subsubsection-paragraph">
                In a Deep Recurrent Neural Network with multiple hidden layers, each layer has its own recurrent connections, characterized by a unique weight matrix <InlineMath math="W_{hh}" />. This setup 
                allows each layer to capture different aspects of the temporal information in the data, adding depth to the model's learning capability. Let's consider a stacked RNN designed for processing sequential data, with 
                two hidden layers for illustration:

                <ol className="step-list">
                    <li>
                        <strong>First Hidden Layer:</strong> This layer directly processes the input sequence. Its hidden state at time step <InlineMath math="t" />, denoted <InlineMath math="h_t^{(1)}" />, is updated 
                        based on the input at the same time step <InlineMath math="x_t" /> and its previous hidden state <InlineMath math="h_{t-1}^{(1)}" />, using its own <InlineMath math="W_{hh}^{(1)}" /> matrix:
                        <BlockMath math="h_t^{(1)} = \sigma(W_{xh}^{(1)} \cdot x_t + W_{hh}^{(1)} \cdot h_{t-1}^{(1)} + b_h^{(1)})" />
                    </li>
                    <li>
                        <strong>Second Hidden Layer:</strong> This layer receives the output of the first hidden layer as its input. The hidden state <InlineMath math="h_t^{(2)}" /> at time step <InlineMath math="t" /> is 
                        updated based on the output of the first hidden layer at the same time step <InlineMath math="h_t^{(1)}" /> and its own previous hidden state <InlineMath math="h_{t-1}^{(2)}" />, using its 
                        own <InlineMath math="W_{hh}^{(2)}" /> matrix:
                        <BlockMath math="h_t^{(2)} = \sigma(W_{hh}^{(2)} \cdot h_{t-1}^{(2)} + W_{hh}^{(1,2)} \cdot h_t^{(1)} + b_h^{(2)})" />
                        Here, <InlineMath math="W_{hh}^{(1,2)}" /> represents the weights for connections from the first hidden layer to the second hidden layer.
                    </li>
                </ol>
                Each <InlineMath math="W_{hh}" />  matrix governs the flow of information within its respective layer, allowing the RNN to build a hierarchical representation of the data. The first layer might capture basic patterns, 
                while subsequent layers can extract more abstract features.
                </p>


                <h4>RNN Gradients</h4>
                <p className="subsubsection-paragraph">
                    A significant challenge in training Recurrent Neural Networks (RNNs) is dealing with vanishing and exploding gradients, phenomena that arise during the backpropagation 
                    through time (BPTT) process. BPTT is an extension of the standard backpropagation algorithm, tailored for RNNs to handle their sequential nature. It involves unrolling 
                    the RNN across time steps and applying backpropagation at each of these steps. However, due to the repeated multiplication of gradients through the network's recurrent
                     connections, the gradients can either shrink (vanish) or grow (explode) exponentially as they propagate backward through time.
                </p>
                <p className="subsubsection-paragraph">
                    <strong>Vanishing Gradients:</strong> When gradients vanish, they become so small that the weights in the early layers of the RNN hardly update, making it nearly impossible for
                     the model to learn long-range dependencies within the sequence. This issue is particularly pronounced with activation functions like the hyperbolic tangent (tanh) or the sigmoid
                      function, where the gradient can become extremely small, effectively stopping the network from learning.
                </p>
                <p className="subsubsection-paragraph">
                    <strong>Exploding Gradients:</strong> Conversely, exploding gradients result in very large updates to weights, causing the model to diverge and become unstable. This can happen
                     when the gradients accumulate across long sequences, magnifying any small changes to the point where they overwhelm the model's parameters.
                </p>

                <p className="subsubsection-paragraph">
                    Consider a simple RNN with a sigmoid activation function and a single hidden unit, processing a sequence of length 4. Assume the hidden-to-hidden 
                    weight (<InlineMath math="W_{hh}" />) is the scalar 0.5; the sigmoid has a maximum derivative of 0.25. During backpropagation, the
                     gradient of the loss with respect to the hidden state at each time step is multiplied by <InlineMath math="W_{hh}" /> and the derivative of the sigmoid function. 

                     <ol className="step-list">
                    <li>
                        Starting from the last time step, the gradient is <InlineMath math="\frac{\partial L}{\partial h_4}" />. As we move back to the third time step, this gradient is multiplied by <InlineMath math="W_{hh}" /> and the derivative of the sigmoid, effectively scaling it by 0.5 * 0.25 = 0.125.
                    </li>
                    <li>
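                        Moving to the second time step, the gradient <InlineMath math="\frac{\partial L}{\partial h_3}" /> is scaled down by a further factor of 0.125, so <InlineMath math="\frac{\partial L}{\partial h_2}" /> carries a total scaling of 0.125^2 relative to <InlineMath math="\frac{\partial L}{\partial h_4}" />.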
                    </li>
                    <li>
                        By the time we reach the first time step, the gradient <InlineMath math="\frac{\partial L}{\partial h_1}" /> has been scaled by 0.125^3, which is a very small number, diminishing the gradient's effect on the weight update.
                    </li>
                </ol>
                </p>

                {/* <p className="subsubsection-paragraph">
                    This step-by-step reduction illustrates how the gradient can vanish as it is propagated back through time, making it challenging for the RNN to learn from inputs that occurred
                     earlier in the sequence.
                </p> */}

                <p className="subsubsection-paragraph">
                    Various strategies have been developed to mitigate these issues, including gradient clipping, which caps the gradients during backpropagation to prevent them from exploding, 
                    and the use of gating mechanisms in advanced RNN variants like Long Short-Term Memory networks and Gated Recurrent Units. These mechanisms introduce a way to control
                     the flow of information and gradients through the network, making it easier for the model to capture long-term dependencies without suffering from vanishing or exploding gradients.
                </p>

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Hidden Layer Size:</strong>  Determines the number of neurons in the hidden layer. A larger hidden layer increases the model's capacity to store and process information, 
                        allowing it to capture more complex patterns in the data. However, it also raises the risk of overfitting and increases computational requirements.
                        </p>
                    </li>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Activation Function:</strong>  Influences the non-linearity introduced at each neuron. Common choices include ReLU, tanh, and sigmoid.
                         The activation function affects the gradient flow during backpropagation and can impact the network's ability to learn complex patterns. For example,
                          tanh and sigmoid are traditionally used in RNNs, but they can lead to vanishing gradient problems.
                        </p>
                    </li>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Learning Rate:</strong>  Controls the step size during weight updates in the training process. A higher learning rate accelerates learning but can overshoot minima,
                         while a lower learning rate ensures more stable convergence but can slow down the training process.
                        </p>
                    </li>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Sequence Length for BPTT:</strong> Specifies the number of time steps for which the RNN is unrolled during backpropagation through time. Longer sequences
                         provide more context but increase computational complexity and can exacerbate vanishing or exploding gradient issues. Truncated BPTT can be used to mitigate these 
                         problems by limiting the number of steps the gradient is propagated backward.
                        </p>
                    </li>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Weight Initialization:</strong> The method used to initially set the weights of the RNN can significantly affect its training dynamics and final performance.
                         Proper initialization can prevent early vanishing or exploding gradients and help ensure that the network starts in a good region of the parameter space.
                        </p>
                    </li>
                    <li>
                        
                        <p className="subsubsection-paragraph">
                        <strong>Regularization Techniques:</strong> Techniques such as dropout, L1/L2 regularization, and early stopping are used to prevent overfitting, ensuring that the
                         model generalizes well to unseen data. In RNNs, dropout is often applied selectively to non-recurrent connections to preserve the network's ability to learn long-term dependencies.
                        </p>
                    </li>
                </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Here's a Python example using TensorFlow to construct an RNN for a simple time series prediction task:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
import numpy as np
import matplotlib.pyplot as plt

# Generate synthetic time series data
def generate_time_series(batch_size, n_steps):
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))  # wave 1
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20)) # wave 2
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)   # noise
    return series[..., np.newaxis].astype(np.float32)

# Prepare the data
n_steps = 50
series = generate_time_series(10000, n_steps + 1)
X_train, y_train = series[:7000, :n_steps], series[:7000, -1]
X_valid, y_valid = series[7000:9000, :n_steps], series[7000:9000, -1]
X_test, y_test = series[9000:, :n_steps], series[9000:, -1]

# Define the RNN model
model = Sequential([
    SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    SimpleRNN(20),
    Dense(1)
])

model.compile(loss="mean_squared_error", optimizer="adam")
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

# Plotting the results
def plot_learning_curves(loss, val_loss):
    plt.plot(np.arange(len(loss)) + 0.5, loss, "b.-", label="Training loss")
    plt.plot(np.arange(len(val_loss)) + 1, val_loss, "r.-", label="Validation loss")
    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    plt.axis([1, 20, 0, 0.025])
    plt.legend(fontsize=14)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.grid(True)

plot_learning_curves(history.history["loss"], history.history["val_loss"])
plt.show()`}
                        </SyntaxHighlighter>
                    </p>
                </section>


                
                <section id="lstm" className="code-cleaned">
                <h2>Long Short-Term Memory (LSTM) Networks</h2>
                <p className="subsubsection-paragraph">
                    Long Short-Term Memory networks (LSTMs) are a sophisticated variant of RNNs designed to learn long-term dependencies and overcome the vanishing gradient problem inherent 
                    in traditional RNNs. They excel at tasks such as language translation, where the data typically consists of long sequences.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Sequence to sequence modeling</span>,
                                    <span style={{ color: '#008000' }}> Time series prediction</span>,
                                    <span style={{ color: '#ff4500' }}> Sentiment analysis</span>,
                                    <span style={{ color: '#1e90ff' }}> Machine translation</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.LSTM)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (torch.nn.LSTM)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Typically <span>O(t*n^2)</span>, where <i>t</i> is the length of the input sequence and <i>n</i> is the number of hidden units; LSTMs have a higher constant factor due to their more complex cell structure compared to vanilla RNNs
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Long Short-Term Memory"</span> by Hochreiter & Schmidhuber, 1997; introduced the LSTM architecture as a solution to the vanishing gradient problem in recurrent neural networks
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>LSTM Foundations</h4>
                <p className="subsubsection-paragraph">
                    The architecture of LSTM units is composed of a cell state and three types of gates: input (<InlineMath math="i_t" />), forget (<InlineMath math="f_t" />), and 
                    output (<InlineMath math="o_t" />) gates. Each gate in the LSTM architecture has a specific role 
                    in regulating the flow of information. The mathematical formulations for these gates are as follows:
                    <BlockMath math="f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)" />
                    <BlockMath math="i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i)" />
                    <BlockMath math="o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o)" />
                    <BlockMath math="\tilde{C}_t = \tanh(W_C \cdot [h_{t-1}, x_t] + b_C)" />
                    <BlockMath math="C_t = f_t * C_{t-1} + i_t * \tilde{C}_t" />
                    <BlockMath math="h_t = o_t * \tanh(C_t)" />
                    where <InlineMath math="\sigma" /> is the sigmoid function, <InlineMath math="W_f, W_i, W_o, W_C" /> are the weights, <InlineMath math="b_f, b_i, b_o, b_C" /> are the 
                    biases, <InlineMath math="C_t" /> is the cell state at time <InlineMath math="t" />, and <InlineMath math="h_t" /> is the hidden state. Note that the
                    matrices <InlineMath math="W_f, W_i, W_o, W_C" /> are usually split into two sets of weights (one applied to the hidden state and one applied to the input), so
                    that <InlineMath math="f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)" /> is equivalent
                    to <InlineMath math="f_t = \sigma(U_f \cdot h_{t-1} +  V_f \cdot x_t + b_f)" />; for simplicity, however, we'll keep the compact form.

                    <figure className="flex-container-caption">
                        <div className="flex-container"><img src={lstm} alt="Broken" className="image-medium"/></div>
                        <figcaption>An overview of the LSTM architecture. The cell state and hidden state are updated as you move through the sequence; <a href="https://blog.mlreview.com/understanding-lstm-and-its-diagrams-37e2f46f1714" target="_blank" rel="noopener noreferrer">image source</a>.</figcaption>
                        </figure>
                    
                    <br/>
                    Let's walk through a step-by-step example to see
                    exactly how an LSTM works with all of its gates:

                    <ol className="step-list">
                        <li>
                            <strong>Forget Gate:</strong> This gate decides which information is discarded from the cell state. It takes the previous hidden state <InlineMath math="h_{t-1}" /> and the current input <InlineMath math="x_t" /> and applies a sigmoid function:
                            <BlockMath math="f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)" />
                            Here, <InlineMath math="W_f" /> and <InlineMath math="b_f" /> are the weights and bias for the forget gate, respectively.
                        </li>
                        <br/>
                        <li> 
                            <strong>Input Gate and Candidate Cell State:</strong> The input gate decides which new information is stored in the cell state, while the candidate cell state creates a vector of new candidate values:
                            <BlockMath math="i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i)" />
                            <BlockMath math="\tilde{C}_t = \tanh(W_C \cdot [h_{t-1}, x_t] + b_C)" />
                            <InlineMath math="W_i" />, <InlineMath math="b_i" />, <InlineMath math="W_C" />, and <InlineMath math="b_C" /> are the weights and biases for the input gate and candidate cell state, respectively.
                        </li>
                        <br/>
                        <li>
                            <strong>Cell State Update:</strong> The cell state is updated by forgetting the old information (as decided by the forget gate) and adding new candidate values (filtered by the input gate):
                            <BlockMath math="C_t = f_t \ast C_{t-1} + i_t \ast \tilde{C}_t" />
                            This equation combines the previous cell state <InlineMath math="C_{t-1}" /> with the new candidate cell state <InlineMath math="\tilde{C}_t" /> modulated by the forget
                             gate <InlineMath math="f_t" /> and input gate <InlineMath math="i_t" /> respectively. The <InlineMath math="\ast" /> here denotes element-wise multiplication.
                        </li>
                        <br/>
                        <li>
                            <strong>Output Gate and Hidden State Update:</strong> The output gate decides which parts of the cell state are output, and this filtered version is used to update the hidden state:
                            <BlockMath math="o_t = \sigma(W_o \cdot [h_{t-1}, x_t] + b_o)" />
                            <BlockMath math="h_t = o_t \ast \tanh(C_t)" />
                            Here, <InlineMath math="W_o" /> and <InlineMath math="b_o" /> are the weights and bias for the output gate. The new hidden state <InlineMath math="h_t" /> is computed as the output gate's output <InlineMath math="o_t" /> modulating the cell state <InlineMath math="C_t" />, passed through a <InlineMath math="\tanh" /> function to scale it between -1 and 1.
                        </li>
                    </ol>
                </p>

                <p className="subsubsection-paragraph">
                    Through these steps, the LSTM cell effectively manages its memory, deciding what to remember and what to forget, which allows it to capture long-term dependencies in sequential data.
                    As an example, consider a sentence like "The mouse ran really far away from the car as it escaped from a cat". Suppose you are at the second word, "mouse":
                    at this word, you would want the cell state to remember that the subject is "mouse", which is what the input gate tries to account for. However, the word "mouse" may not
                    be relevant at every step in the sequence (for example, "far away" doesn't really require us to know what was running away), so the output gate accounts for that.
                    The forget gate may be used when something is mentioned multiple times; it would be a way to remove redundancies. Through this process, we can continually "remember" or "forget" 
                    information as it becomes relevant (or irrelevant)!
                </p>


                <h4>Hyperparameters</h4>
                    <p className="subsubsection-paragraph">
                    <ul>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Number of LSTM Units:</strong> Determines the dimensionality of the hidden state and cell state in each LSTM cell. More units can allow the model to capture 
                            more complex information but may increase the risk of overfitting and computational cost.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Number of LSTM Layers:</strong> Stacking multiple LSTM layers can help the model learn higher-level temporal representations. However, deeper networks are more
                             challenging to train and require more data and computational resources.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Learning Rate:</strong> Controls the size of the updates to the model's weights during training. Too large a learning rate can cause the model to converge too 
                            quickly to a suboptimal solution, while too small a rate can slow down the training process.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Batch Size:</strong> Refers to the number of training samples used in one iteration of model training. Smaller batch sizes can provide a regularizing effect
                             and lower generalization error but may increase training time.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Sequence Length:</strong> The length of the input sequences processed by the LSTM. Longer sequences can provide more context but increase computational load
                             and may introduce challenges in learning dependencies due to vanishing gradients.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Dropout:</strong> A regularization technique where randomly selected neurons are ignored during training, which helps prevent overfitting. Dropout can be 
                            applied to the inputs and/or the outputs of the LSTM cells.
                            </p>
                        </li>
                        <li>
                            <p className="subsubsection-paragraph">
                            <strong>Gradient Clipping:</strong> A technique to prevent exploding gradients by setting a threshold value. If the gradients exceed this value, they are scaled down 
                            to keep them within a manageable range.
                            </p>
                        </li>
                    </ul>
                    </p>


                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Implementing an LSTM network in Python is straightforward with frameworks like TensorFlow. Here's an example of using an LSTM for sequence classification:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np

# Generate synthetic data for sequence classification
X_train, y_train = np.random.random((1000, 10, 1)), np.random.randint(2, size=(1000, 1))
X_test, y_test = np.random.random((200, 10, 1)), np.random.randint(2, size=(200, 1))

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='tanh', input_shape=(10, 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32)

# Evaluate the model
model.evaluate(X_test, y_test)`}
                        </SyntaxHighlighter>
                    </p>
                </section>




                <section id="gru" className="code-cleaned">
                <h2>Gated Recurrent Units (GRUs)</h2>
                <p className="subsubsection-paragraph">
                    Gated Recurrent Units (GRUs) are an advanced variation of the standard recurrent neural network. They are designed to adaptively capture dependencies of different time scales 
                    in sequence data. GRUs address the vanishing gradient problem in traditional RNNs and offer a more complex and capable architecture for processing sequential data, particularly 
                    in long sequences. They are a simplification of LSTMs in a sense because they reduce the number of "gates". GRUs, introduced by Cho et al. in 2014, modify the traditional RNN architecture by incorporating gating units. 
                    These gates effectively regulate the flow of information within 
                    the unit, balancing between the memory (past information) and the current input. GRUs have been shown to perform exceptionally well on tasks requiring the modeling of 
                    long-distance temporal relationships, and they are computationally more efficient than LSTMs.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Natural language processing tasks</span>,
                                    <span style={{ color: '#008000' }}> Sequence learning</span>,
                                    <span style={{ color: '#ff4500' }}> Speech recognition</span>,
                                    <span style={{ color: '#1e90ff' }}> Language translation</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.GRU)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (torch.nn.GRU)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Similar to LSTMs, typically <span>O(t*n^2)</span>, where <i>t</i> is the length of the input sequence and <i>n</i> is the number of hidden units; GRUs are often more efficient than LSTMs due to having fewer parameters
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation"</span> by Cho et al., 2014; introduced the GRU as an alternative to the LSTM
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>



                <h4>GRU Foundations</h4>
                <p className="subsubsection-paragraph">
                    The architecture of a GRU is characterized by two gates: the reset gate and the update gate. These gates determine how much of the past information needs to be passed along 
                    to the future. The equations governing these gates are:
                    <BlockMath math="r_t = \sigma(W_r \cdot [h_{t-1}, x_t] + b_r)" />
                    <BlockMath math="z_t = \sigma(W_z \cdot [h_{t-1}, x_t] + b_z)" />
                     Let's consider
                    a step by step example of how an update to the hidden state would occur:

                    <ol className="step-list">
                        <li>
                            <strong>Update Gate:</strong> Determines the balance between the previous state and new information.
                            <BlockMath math="z_t = \sigma(W_z \cdot [h_{t-1}, x_t] + b_z)" />
                            The update gate <InlineMath math="z_t" /> uses the previous hidden state <InlineMath math="h_{t-1}" /> and the current input <InlineMath math="x_t" />, 
                            modulated by its weights <InlineMath math="W_z" /> and bias <InlineMath math="b_z" />.
                        </li>
                        <br/>
                        <li>
                            <strong>Reset Gate:</strong> Decides how much past information to forget.
                            <BlockMath math="r_t = \sigma(W_r \cdot [h_{t-1}, x_t] + b_r)" />
                            The reset gate <InlineMath math="r_t" /> influences the degree to which previous hidden state <InlineMath math="h_{t-1}" /> affects the memory content, 
                            using weights <InlineMath math="W_r" /> and bias <InlineMath math="b_r" />.
                        </li>
                        <br/>
                        <li>
                            <strong>Candidate Hidden State:</strong> A blend of new input and past information, modulated by the reset gate.
                            <BlockMath math="\tilde{h}_t = \tanh(W \cdot [r_t \ast h_{t-1}, x_t] + b)" />
                            The candidate hidden state <InlineMath math="\tilde{h}_t" /> is computed considering the reset gate's output, with its own 
                            weights <InlineMath math="W" /> and bias <InlineMath math="b" />.
                        </li>
                        <br/>
                        <li>
                            <strong>Hidden State Update:</strong> The final step updates the hidden state by integrating the old state with the candidate state, governed by the update gate.
                            <BlockMath math="h_t = z_t \ast h_{t-1} + (1 - z_t) \ast \tilde{h}_t" />
                            This equation ensures that the hidden state <InlineMath math="h_t" /> at time <InlineMath math="t" /> is a mixture of the previous 
                            state <InlineMath math="h_{t-1}" /> and the candidate state <InlineMath math="\tilde{h}_t" />, as determined by the update gate <InlineMath math="z_t" />.
                        </li>
                    </ol>

                    Essentially, the candidate hidden state (shaped by the reset gate) is what the hidden state would be if we were only concerned with what is happening at the current
                    step in the sequence: the reset gate minimizes the impact of previous steps that aren't particularly relevant to the current input. However, some of that information
                    may still be of value in other parts of the sequence, so the update gate comes in to preserve it (even if it is not particularly relevant right now).
                </p>

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                        <p className="subsubsection-paragraph">
                            <strong>Update and Reset Gates:</strong> The configuration and tuning of the update and reset gates in GRUs are crucial. While not directly exposed as hyperparameters, 
                            the way these gates are trained (through their respective weight matrices and biases) significantly impacts the GRU's ability to model temporal dependencies and manage 
                            the flow of information. Optimizing these aspects involves careful initialization and regularization of the gate weights.
                        </p>
                    </li>
                    <li>
                        <p className="subsubsection-paragraph">
                            <strong>GRU Variants:</strong> There are several variations of the standard GRU model, such as the minimal gated unit (MGU). Each variant adjusts the gate mechanisms
                             and internal operations, potentially offering computational advantages or better performance on specific tasks. Choosing the right variant involves understanding the 
                             trade-offs between complexity, computational efficiency, and task suitability.
                        </p>
                    </li>
                    <li>
                        <p className="subsubsection-paragraph">
                            <strong>Gate Activation Functions:</strong> While sigmoid and tanh are standard choices for gate and candidate state activations in GRUs, experimenting with alternative
                             activation functions like ReLU or Leaky ReLU for the gates or candidate update can influence training dynamics and model performance. The choice of activation functions 
                             can affect the gradient flow and the model's ability to capture long-term dependencies.
                        </p>
                    </li>
                    <li>
                        <p className="subsubsection-paragraph">
                            <strong>Temporal Resolution:</strong> The granularity of the input data and the corresponding sequence length can impact GRU performance. In some cases, adjusting the
                             temporal resolution of the data (e.g., aggregating time steps) can make it easier for the GRU to capture relevant patterns and reduce computational complexity.
                        </p>
                    </li>
                </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Implementing GRUs in Python can be done using deep learning libraries like TensorFlow. Here's an example of using a GRU for binary sequence classification on synthetic data:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
import numpy as np

# Generate synthetic time series data
X, y = np.random.random((1000, 20, 1)), np.random.randint(2, size=(1000, 1))

# Split the data
X_train, y_train = X[:800], y[:800]
X_test, y_test = X[800:], y[800:]

# Define the GRU model
model = Sequential()
model.add(GRU(50, activation='tanh', input_shape=(20, 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
model.evaluate(X_test, y_test)`}
                        </SyntaxHighlighter>
                    </p>
                </section>


                <section id="bi" className="code-cleaned">
                <h2>Bi-Directional RNNs</h2>
                <p className="subsubsection-paragraph">
                    Bi-directional Recurrent Neural Networks (Bi-RNNs) extend the traditional RNN architecture to enhance the model's understanding of the context in sequence data. By 
                    processing data in both forward and backward directions, Bi-RNNs capture information that may be overlooked by unidirectional RNNs.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Text classification</span>,
                                    <span style={{ color: '#008000' }}> Sentiment analysis</span>,
                                    <span style={{ color: '#ff4500' }}> Language translation</span>,
                                    <span style={{ color: '#1e90ff' }}> Speech recognition</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.Bidirectional)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (bidirectional=True on torch.nn.RNN, torch.nn.LSTM, or torch.nn.GRU)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Typically <span>O(2*t*n^2)</span>, where <i>t</i> is the length of the input sequence and <i>n</i> is the number of hidden units; the factor of 2 accounts for the forward and backward passes
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
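                                    <span>"Bidirectional Recurrent Neural Networks"</span> by Schuster & Paliwal, 1997; introduced the idea of running forward and backward recurrent layers over the same sequence. Bidirectionality is now applied as a general technique that can wrap any recurrent layer (RNN, LSTM, GRU) and is widely used in the literature on sequence modeling tasks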
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>Bi-Directional RNN Foundations</h4>
                <p className="subsubsection-paragraph">
                    A Bi-RNN consists of two RNN layers that run in parallel: one processes the input sequence from start to end, while the other processes it from end to start. The outputs of
                     these two layers are typically concatenated at each time step, providing a comprehensive view of the sequence from both directions. This dual processing allows the network 
                     to capture dependencies that might be missed when the sequence is processed in only one direction. Mathematically, the forward hidden 
                     state <InlineMath math="\overrightarrow{h_t}" /> and
                      the backward hidden state <InlineMath math="\overleftarrow{h_t}" /> at time <InlineMath math="t" /> can be represented as:
                    <BlockMath math="\overrightarrow{h_t} = \sigma(W_{\overrightarrow{h}}x_t + U_{\overrightarrow{h}}\overrightarrow{h_{t-1}} + b_{\overrightarrow{h}})" />
                    <BlockMath math="\overleftarrow{h_t} = \sigma(W_{\overleftarrow{h}}x_t + U_{\overleftarrow{h}}\overleftarrow{h_{t+1}} + b_{\overleftarrow{h}})" />
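                    The final output at time <InlineMath math="t" /> is a combination of both hidden states: <InlineMath math="y_t = f(\overrightarrow{h_t}, \overleftarrow{h_t})" />. Here <InlineMath math="\sigma" /> stands for the hidden-layer activation function (the worked example below uses <InlineMath math="\tanh" />).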
                </p>

                <div className="subsubsection-paragraph">
                <p>Let's explore a Bidirectional RNN processing the sequence "A B" over two time steps:</p>

                <h4>Time Step 1 (t = 1)</h4>
                <ol className="step-list">
                    <li>
                    <p className="subsubsection-paragraph">
                        <strong>Input Encoding:</strong> The input "A" at time step 1 is encoded as <InlineMath math="x_1" />.
                    </p>
                    </li>
                    <li>
                    <p className="subsubsection-paragraph">
                        <strong>Forward Pass:</strong> The forward RNN computes its hidden state <InlineMath math="\overrightarrow{h}_1" /> based on <InlineMath math="x_1" /> and its initial hidden
                         state <InlineMath math="\overrightarrow{h}_0" /> (assumed to be a zero vector for simplicity):
                        <BlockMath math="\overrightarrow{h}_1 = \tanh(\overrightarrow{W}_{xh} \cdot x_1 + \overrightarrow{W}_{hh} \cdot \overrightarrow{h}_0 + \overrightarrow{b}_h)" />
                    </p>
                    </li>
                    <li>
                    <p className="subsubsection-paragraph">
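                        <strong>Backward Pass Initialization:</strong> The backward RNN reads the sequence from its end, so it has not produced a hidden state for this time step yet: it must process "B" first. Its
                         hidden state <InlineMath math="\overleftarrow{h}_1" /> is computed only after <InlineMath math="\overleftarrow{h}_2" /> is available, as the backward pass moves from the end of the sequence toward its start.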
                    </p>
                    </li>
                </ol>

                <h4>Time Step 2 (t = 2)</h4>
                <ol className="step-list">
                    <li>
                    <p className="subsubsection-paragraph">
                        <strong>Input Encoding:</strong> The input "B" at time step 2 is encoded as <InlineMath math="x_2" />.
                    </p>
                    </li>
                    <li>
                    <p className="subsubsection-paragraph">
                        <strong>Forward Pass Continuation:</strong> The forward RNN updates its hidden state <InlineMath math="\overrightarrow{h}_2" /> based on <InlineMath math="x_2" /> and <InlineMath math="\overrightarrow{h}_1" />:
                        <BlockMath math="\overrightarrow{h}_2 = \tanh(\overrightarrow{W}_{xh} \cdot x_2 + \overrightarrow{W}_{hh} \cdot \overrightarrow{h}_1 + \overrightarrow{b}_h)" />
                    </p>
                    </li>
                    <li>
                    <p className="subsubsection-paragraph">
                        <strong>Backward Pass Start:</strong> The backward RNN processes "B" as its first step (since it moves from end to start). It computes its hidden 
                        state <InlineMath math="\overleftarrow{h}_2" /> based on <InlineMath math="x_2" /> and its initial state <InlineMath math="\overleftarrow{h}_3" /> (the sequence 
                        has length 2, so there is no time step 3 and this state is initialized to a zero vector):
                        <BlockMath math="\overleftarrow{h}_2 = \tanh(\overleftarrow{W}_{xh} \cdot x_2 + \overleftarrow{W}_{hh} \cdot \overleftarrow{h}_3 + \overleftarrow{b}_h)" />
                    </p>
                    </li>
                </ol>

                <p className="subsubsection-paragraph">
                    After processing both time steps, the BiRNN has computed <InlineMath math="\overrightarrow{h}_1" />, <InlineMath math="\overrightarrow{h}_2" />, 
                    and <InlineMath math="\overleftarrow{h}_2" />. The backward RNN's hidden state for the first time step (<InlineMath math="\overleftarrow{h}_1" />) is computed in the next step of the
                     backward pass, from <InlineMath math="x_1" /> and <InlineMath math="\overleftarrow{h}_2" />, reflecting the sequence's end-to-start processing nature.
                </p>

                <p className="subsubsection-paragraph">
                    <strong>Concatenating Hidden States:</strong> At each time step, the forward and backward hidden states can be concatenated to form a comprehensive representation that captures
                     both past and future context:
                    <ul>
                    <li>At <InlineMath math="t=1" />, the combined state is <InlineMath math="[ \overrightarrow{h}_1; \overleftarrow{h}_1 ]" /> (Note: <InlineMath math="\overleftarrow{h}_1" /> is computed in the next step of the backward pass, after <InlineMath math="\overleftarrow{h}_2" />).</li>
                    <li>At <InlineMath math="t=2" />, the combined state is <InlineMath math="[ \overrightarrow{h}_2; \overleftarrow{h}_2 ]" />.</li>
                    </ul>
                    These combined states can be used for further processing or as input to a subsequent layer, such as a fully connected layer for classification or regression tasks.
                </p>
                </div>


                <h4>Hyperparameters</h4>
                    <p className="subsubsection-paragraph">
                        <ul>
                            <li>
                                <p className="subsubsection-paragraph">
                                    <strong>Type of RNN Cells:</strong> The choice between basic RNN cells, LSTMs, and GRUs for the forward and backward networks. LSTM and GRU cells are generally preferred for their ability to capture long-term dependencies and mitigate vanishing gradient issues.
                                </p>
                            </li>
                            <li>
                                <p className="subsubsection-paragraph">
                                    <strong>Combination Method:</strong> The strategy for integrating the outputs of the forward and backward layers at each time step. Common approaches include concatenation (which preserves all information but doubles the feature size), summing, and averaging. The choice can affect the subsequent layer's input size and how the information from both directions is utilized.
                                </p>
                            </li>
                        </ul>
                    </p>


                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Implementing a Bi-RNN in Python is straightforward with deep learning frameworks like TensorFlow. Here's an example that uses TensorFlow's Keras API for binary sequence classification on synthetic data:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
import numpy as np

# Generate synthetic data for sequence classification
X, y = np.random.random((1000, 20, 1)), np.random.randint(2, size=(1000, 1))

# Split the data
X_train, y_train = X[:800], y[:800]
X_test, y_test = X[800:], y[800:]

# Define the Bi-RNN model
model = Sequential()
model.add(Bidirectional(LSTM(50, activation='tanh'), input_shape=(20, 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
model.evaluate(X_test, y_test)`}
                        </SyntaxHighlighter>
                    </p>
                </section>

                
                
                <div className="subsubsection-navigation">
                    <Link to="/ml/cnn">← Convolutional Neural Networks</Link>
                    <Link to="/ml/seq2seq">Seq2Seq →</Link>
                </div>
            </main>
            
            <Footer />
        </div>
    );
}

// Default export of this module: the RNN page component defined above.
export default RNN;
