import React from 'react';
import '../../styles/subsection.css';
import Header from '../../components/Header';
import Footer from '../../components/Footer';
import { Link } from 'react-router-dom';
import 'katex/dist/katex.min.css';
import { InlineMath, BlockMath } from 'react-katex';
import { LightAsync as SyntaxHighlighter } from 'react-syntax-highlighter';
import { docco } from 'react-syntax-highlighter/dist/esm/styles/hljs';


function CNN() {
    return (
        <div className="subsubsection-container">
            <Header />
            <div className="side-nav-container">
                <aside className="subsubsection-side-nav">
                    <a href="#found">Fundamentals</a>
                    <a href="#capsule">Capsule Networks</a>
                    <a href="#adv">Variants</a>
                </aside>
            </div>
            
            <main className="subsubsection-content">
                <div className="titles"><h1>Convolutional Neural Networks</h1></div>

                <section id="found" className="code-cleaned">
                <h2>Fundamentals of CNNs</h2>
                <p className="subsubsection-paragraph">
                    Convolutional Neural Networks (CNNs) are a class of deep neural networks, most commonly applied to analyzing visual imagery. They are particularly adept at processing data 
                    with a grid-like topology, such as images.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Sentence classification</span>,
                                    <span style={{ color: '#008000' }}> Sentiment analysis</span>,
                                    <span style={{ color: '#ff4500' }}> Topic categorization</span>,
                                    <span style={{ color: '#1e90ff' }}> Document summarization</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.Conv1D for 1D convolutions)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (torch.nn.Conv1d for 1D convolutions)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>Varies based on network architecture, typically O(n*m*d)</span>, where <i>n</i> is the size of the input, <i>m</i> is the filter size, and <i>d</i> is the depth of the network
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Convolutional Neural Networks for Sentence Classification"</span> by Yoon Kim, 2014
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>Core Components</h4>
                    <p className="subsubsection-paragraph">
                        A CNN typically consists of a sequence of layers, each designed to process and transform the input data to extract and learn high-level features. These layers include:
                        <ol>
                        <li><strong>Convolutional Layers:</strong> Apply a set of learnable filters to the input. Each filter captures specific features, and its application results in feature maps.</li>
                        <li><strong>Activation Functions:</strong> Introduce non-linearities into the model, enabling it to learn complex patterns. The ReLU function is commonly used.</li>
                        <li><strong>Pooling Layers:</strong> Reduce the spatial dimensions (width and height) of the input volume for the next convolutional layer, decreasing the computational complexity and the
                         number of parameters.</li>
                        <li><strong>Fully Connected Layers:</strong> These layers connect every neuron in one layer to every neuron in the next layer, and are typically placed at the end of CNN architectures to
                         perform classification based on the features extracted by the convolutional layers.</li>
                        </ol>
                    </p>

                    <h4>Convolution Operation</h4>
                    <p className="subsubsection-paragraph">
                        The convolution operation is the core building block of a CNN. It involves sliding a set of learnable filters or kernels over the input image and computing the dot product
                         between the filter values and the input values at each position:
                        <BlockMath math="\text{If } K \text{ is a filter of size } m \times n, \text{ and } I \text{ is the input, then the convolution }" />
                         <InlineMath math="C \text{ at position } (x, y) \text{ is given by:}" />
                        <BlockMath math="C(x, y) = \sum_{i=1}^{m} \sum_{j=1}^{n} K(i, j) \times I(x+i-1, y+j-1)" />
                        This operation is repeated for every position in the input image, producing a feature map for each filter. These operations can be done at different dimensionalities; for example,
                        a 1D convolution, often used in time-series analysis and NLP, involves a one-dimensional kernel moving across a single dimension of data. It works well with data that has a temporal 
                or sequential nature. The convolution operation in 1D is expressed as:
                <BlockMath math="S(t) = (K * I)(t) = \sum_{a=-\infty}^{\infty} K(a) \cdot I(t - a)" />
                where <InlineMath math="S(t)" /> is the output signal, <InlineMath math="K" /> is the kernel, and <InlineMath math="I(t)" /> is the input signal at time <InlineMath math="t" />. We let * represent 
                a convolution operation.
                    </p>

                    <h4>CNN Forward Pass</h4>
                    <p className="subsubsection-paragraph">
                        After the convolution operation, an activation function like ReLU (Rectified Linear Unit) is applied to introduce non-linearity, allowing the network to learn complex patterns:
                        <BlockMath math="\text{ReLU}(x) = \max(0, x)" />
                        This operation is applied element-wise to the output of the convolution operation.
                    </p>

                    <p className="subsubsection-paragraph">
                        Pooling layers reduce the dimensionality of each feature map while retaining the most important information. Max pooling, one common approach, selects the maximum element from the region of
                         the feature map covered by the pooling window:
                        <BlockMath math="\text{MaxPooling}(K) = \max_{(x, y) \in W} K(x, y)" />
                        where <InlineMath math="W" /> is the region covered by the pooling window in the feature map <InlineMath math="K" />.
                    </p>

                    <p className="subsubsection-paragraph">
                        The high-level reasoning in the neural network is done by the fully connected layers. After several convolutional and pooling layers, the high-level features are flattened into a vector and fed into fully connected layers for classification:
                        <BlockMath math="\text{Output} = \text{Softmax}(W \times \text{Flattened Features} + b)" />
                        where <InlineMath math="W" /> represents the weights and <InlineMath math="b" /> the bias of the fully connected layer.
                    </p>

                    <p className="subsubsection-paragraph">
                        Here is an example worked out for an NLP problem:
                        <ol className="step-list">

                        <li><strong>Step 1: Preprocessing and Embedding:</strong> Our input sentence is "Great movie, loved it!". After tokenization and removing punctuation, we get the tokens ["Great", "movie", "loved", "it"]. Each token is then transformed into an embedding vector. Assume we use 4-dimensional embeddings for simplicity, obtained from a pre-trained model or initialized randomly for training:
                            <ul>
                            <li><InlineMath math="\text{Embedding}(\text{'Great'}) = [0.5, 0.8, 0.1, 0.6]" /></li>
                            <li><InlineMath math="\text{Embedding}(\text{'movie'}) = [0.6, 0.7, 0.2, 0.8]" /></li>
                            <li><InlineMath math="\text{Embedding}(\text{'loved'}) = [0.4, 0.9, 0.3, 0.5]" /></li>
                            <li><InlineMath math="\text{Embedding}(\text{'it'}) = [0.7, 0.6, 0.4, 0.7]" /></li>
                            </ul>
                            These embeddings form our input matrix <InlineMath math="X" /> of size <InlineMath math="4 \times 4" /> (4 words, each represented by a 4-dimensional vector). </li>
                        <br/>

                        <li><strong>Step 2: Convolution Layer: </strong> We apply two convolutional filters across the sequence, each designed to capture different linguistic features. Each filter 
                        spans 3 words and the full embedding dimension, making their size <InlineMath math="3 \times 4" />. For illustration, let's define two filters with arbitrary values:
                            <BlockMath math="\text{Filter 1 (F1)} = \begin{bmatrix} 0.1 & -0.1 & 0.2 & -0.2 \\ 0.2 & 0.1 & -0.2 & -0.1 \\ -0.1 & 0.2 & 0.1 & -0.2 \end{bmatrix}" />
                            <BlockMath math="\text{Filter 2 (F2)} = \begin{bmatrix} -0.2 & 0.1 & -0.1 & 0.2 \\ 0.1 & -0.2 & 0.2 & 0.1 \\ 0.2 & -0.1 & 0.1 & -0.2 \end{bmatrix}" />
                            Each filter is applied to the input matrix <InlineMath math="X" />, sliding over the words to compute the convolution operation for each position where the filter fits. 
                            The output is a feature map for each filter, capturing detected features across the sentence.
                            The first application of F1 covers "Great", "movie", and "loved". The calculation is:
                            <BlockMath math="\text{F1 Output at Step 1} = \text{F1} \cdot \text{Segment}_{\text{Great, movie, loved}}" />
                            <BlockMath math="= \begin{bmatrix} 0.1 & -0.1 & 0.2 & -0.2 \\ 0.2 & 0.1 & -0.2 & -0.1 \\ -0.1 & 0.2 & 0.1 & -0.2 \end{bmatrix} \cdot \begin{bmatrix} 0.5 \\ 0.8 \\ 0.1 \\ 0.6 \end{bmatrix} \begin{bmatrix} 0.6 \\ 0.7 \\ 0.2 \\ 0.8 \end{bmatrix} \begin{bmatrix} 0.4 \\ 0.9 \\ 0.3 \\ 0.5 \end{bmatrix}" />
                            <BlockMath math="= \begin{bmatrix} (0.1 \times 0.5 + -0.1 \times 0.8 + 0.2 \times 0.1 + -0.2 \times 0.6) + \\(0.2 \times 0.6 + 0.1 \times 0.7 + -0.2 \times 0.2 + -0.1 \times 0.8) + \\(-0.1 \times 0.4 + 0.2 \times 0.9 + 0.1 \times 0.3 + -0.2 \times 0.5) \end{bmatrix}" />
                            <BlockMath math="= \begin{bmatrix} -0.13 + 0.07 + 0.07 \end{bmatrix}" />
                            <BlockMath math="= 0.01" />

                            We will leave the rest to the reader to calculate, but essentially you slide the filter down by one word so that it covers "movie", "loved", and "it", and compute the convolution again;
                            that value is then appended to the feature map that already contains 0.01. The resulting feature map is passed on to the next layer.
                            </li>
                        <br/>

                        <li> <strong>Step 3: Activation Function: </strong>After convolution, we apply a ReLU activation function to introduce non-linearity and aid in feature detection:
                            <BlockMath math="\text{ReLU}(x) = \max(0, x)" />
                            The ReLU function is applied element-wise to each value in the feature maps generated by the filters, ensuring that only positive features are retained and negative 
                            values are set to zero.</li>
                        <br/>

                        <li><strong>Step 4: Pooling Layer: </strong> To reduce dimensionality and extract the most significant features, we apply a max pooling operation over the feature maps. 
                        Assuming we use a pooling size of 2, this operation selects the maximum value from each pair of adjacent values in the feature maps, effectively halving their size.</li>
                        <br/>

                        <li><strong>Step 5: Fully Connected Layer and Output: </strong>
                        The output from the pooling layer is flattened into a single vector and passed through a fully connected (dense) layer, which integrates the features to make a final sentiment classification. This layer uses a softmax activation function for multi-class classification or a sigmoid function for binary classification (positive or negative sentiment):
                            <BlockMath math="\text{Softmax}(z_i) = \frac{e^{z_i}}{\sum_{j} e^{z_j}}" />
                            <BlockMath math="\text{Sigmoid}(z) = \frac{1}{1 + e^{-z}}" />
                            The final output is the probability of the sentence expressing positive sentiment, which is used to classify the sentiment of the input sentence.
                        </li>

                        </ol>
                    </p>
            
                    <h4>CNN Backward Pass</h4>
            <p className="subsubsection-paragraph">
            The backward pass in a CNN involves computing gradients with respect to the network's weights and using these gradients to update the weights. This process is essential for learning.
             Let's focus on updating a single weight, <InlineMath math="w" />, in a convolutional filter.
             <ol>
                <li>
                <p>
                <strong>Forward Pass Summary: </strong>Consider a convolutional layer applying a filter with weight <InlineMath math="w" /> to an input feature map, producing an output feature map. For 
                    simplicity, let's consider a single input and output channel, and ignore biases.
                </p>
                </li>

                <li>
                <p>
                <strong>Computing the Loss: </strong>After the forward pass through the entire network, a loss function <InlineMath math="L" /> quantifies the difference between the predicted and true values. 
                    The goal of the backward pass is to minimize this loss.
                </p>
                </li>

                <li>
                <p>
                <strong>Gradients of the Loss: </strong>The gradient of the loss <InlineMath math="L" /> with respect to the output feature map <InlineMath math="O" /> of the convolutional layer is computed during the backward 
                    pass, starting from the final layer and propagated back through the network.
                </p>
                </li>

                <li>
                <p>
                <strong>Gradient with Respect to the Weight: </strong>To update weight <InlineMath math="w" />, we need to compute <InlineMath math="\frac{\partial L}{\partial w}" />, the gradient of the loss with respect 
                    to <InlineMath math="w" />. This involves applying the chain rule:
                    <BlockMath math="\frac{\partial L}{\partial w} = \sum \left( \frac{\partial L}{\partial O} \cdot \frac{\partial O}{\partial w} \right)" />
                    Here, <InlineMath math="\frac{\partial L}{\partial O}" /> is the gradient of the loss with respect to the output feature map,
                     and <InlineMath math="\frac{\partial O}{\partial w}" /> is the gradient of the output feature map with respect to the weight <InlineMath math="w" />, which can be computed 
                     based on the input feature map.
                </p>
                </li>

                <li>
                <p>
                <strong>Computing <InlineMath math="\frac{\partial O}{\partial w}" />: </strong>The term <InlineMath math="\frac{\partial O}{\partial w}" /> represents how changes in weight <InlineMath math="w" /> affect the output feature map. For a given position in 
                    the input feature map <InlineMath math="I" /> where <InlineMath math="w" /> is applied, this term is simply the value at that position in <InlineMath math="I" />, since the 
                    convolution operation involves multiplying <InlineMath math="w" /> by the input and summing up the results.
                </p>
                </li>

                <li>
                
                <p>
                <strong>Weight Update: </strong>Once <InlineMath math="\frac{\partial L}{\partial w}" /> is computed, weight <InlineMath math="w" /> is updated using gradient descent:
                    <BlockMath math="w = w - \alpha \frac{\partial L}{\partial w}" />
                    where <InlineMath math="\alpha" /> is the learning rate, a small positive value that controls the size of the weight update.
                </p>
                </li>
            </ol>

            This process is repeated for all weights in the convolutional filter and across all filters in the layer, and for all layers in the network, iteratively adjusting the weights to 
            minimize the loss and improve the network's performance.
            </p>

            <h4>Multi-Channel</h4>
            <p className="subsubsection-paragraph">
                In a multi-channel CNN, input data and kernels have multiple channels (e.g., RGB color channels in images). Each channel processes the data independently, and the results are
                 summed to form a single output feature map. This allows the network to learn from multiple perspectives of the input data simultaneously. The operation over multi-channel data 
                 can be described as:
                <BlockMath math="\sum_{c=1}^{C} (K_c * I_c)" />
                where <InlineMath math="C" /> is the number of channels, <InlineMath math="K_c" /> is the kernel for channel <InlineMath math="c" />, and <InlineMath math="I_c" /> is the input
                 for channel <InlineMath math="c" />. Multi-channel inputs are most common for images, but they can be used in NLP as well: for example, if several sentences are attached
                 to a single output, each sentence can be given its own channel.
            </p>

            <h4>Hyperparameters</h4>
            <p className="subsubsection-paragraph">
            <ul>
                    <li>
                    <p>
                        <strong>Number of Filters: </strong>Determines the number of filters used in convolutional layers. Each filter extracts different features from the input. A higher number increases the model's capacity 
                        but also its computational complexity and risk of overfitting.
                    </p>
                    </li>

                    <li>
                    <p>
                    <strong>Size of Filters (Kernel Size): </strong>Defines the dimensions of the filters in convolutional layers. Common sizes include <InlineMath math="3 \times 3" />, <InlineMath math="5 \times 5" />, 
                        and <InlineMath math="7 \times 7" />. Smaller filters capture local features, while larger filters capture more global features.
                    </p>
                    </li>

                    <li>
        
                    <p>
                    <strong>Stride: </strong>The stride specifies how many pixels the filter moves across the input matrix after each operation. A stride of 1 moves the filter one pixel at a time, while a larger 
                        stride results in downsampling the feature map.
                    </p>
                    </li>

                    <li>
                    <p>
                    <strong>Padding: </strong>Padding adds zeros (zero-padding) or mirrors the values near the border (reflect-padding) around the input matrix to allow convolutional filters to be applied at the edges. Padding
                        can be 'valid' (no padding), 'same' (padding is added to keep the output size equal to the input size), or a custom value.
                    </p>
                    </li>

                    <li>
   
                    <p>
                    <strong>Pooling Size: </strong>In pooling layers, this parameter defines the size of the pooling window. Common choices are <InlineMath math="2 \times 2" /> or <InlineMath math="3 \times 3" />. Pooling
                         reduces the spatial dimensions (height and width) of the feature maps.
                    </p>
                    </li>

                    <li>

                    <p>
                    <strong>Pooling Type: </strong>Determines the pooling operation. Max pooling takes the maximum value within the window, average pooling computes the average, and global pooling reduces each feature map 
                        to a single value by taking the max or average over the entire map.
                    </p>
                    </li>

                    <li>
                    <p>
                    <strong>Dilation Rate: </strong>Specifies the spacing between the kernel elements. Dilation allows the filter to have a larger receptive field, capturing wider context without increasing the number of 
                        parameters or computational cost.
                    </p>
                    </li>
                </ul>
            </p>

            <h4>In Code</h4>
            <p className="subsubsection-paragraph">
                Implementing a CNN architecture in Python can be done using deep learning libraries such as TensorFlow or PyTorch. Here’s an example of defining a simple CNN architecture using 
                TensorFlow:
                <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

# Number of words to consider as features
max_features = 10000
# Cut texts after this number of words (among top max_features most common words)
maxlen = 500

# Load the data (it's already preprocessed)
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=max_features)

# Pad the sequences to the same length
train_data = sequence.pad_sequences(train_data, maxlen=maxlen)
test_data = sequence.pad_sequences(test_data, maxlen=maxlen)

# Define the model
model = models.Sequential()
model.add(layers.Embedding(max_features, 128, input_length=maxlen))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1, activation='sigmoid'))  # Sigmoid activation for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

history = model.fit(train_data, train_labels, epochs=10, batch_size=128, validation_split=0.2)

test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Test Accuracy: {test_acc}')`}
                        </SyntaxHighlighter>
                    </p>
                </section>


                <section id="capsule" className="code-cleaned">
                <h2>Capsule Networks</h2>
                <p className="subsubsection-paragraph">
                Capsule Networks (CapsNets) are a type of artificial neural network designed to improve upon some of the limitations of traditional CNNs. 
                They were introduced by Geoffrey Hinton and his colleagues in a paper titled "Dynamic Routing Between Capsules" in 2017. Capsule Networks aim to overcome issues related to CNNs,
                 such as the loss of spatial hierarchy between simple and complex objects in the process of pooling, by preserving the spatial relationships between parts of an object.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Text classification</span>,
                                    <span style={{ color: '#008000' }}> Language modeling</span>,
                                    <span style={{ color: '#ff4500' }}> Sentiment analysis</span>,
                                    <span style={{ color: '#1e90ff' }}> Document understanding</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>No standard libraries; implementation typically requires custom development using TensorFlow or PyTorch</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>Depends on network architecture and dynamic routing algorithm; not well-defined like CNNs or RNNs</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Dynamic Routing Between Capsules"</span> by Sara Sabour, Nicholas Frosst, and Geoffrey E. Hinton, 2017
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>Capsule Network Foundations</h4>
                <p className="subsubsection-paragraph">
                    <ul>
                        <li><strong>Capsules: </strong>A capsule is a group of neurons that represents different properties of the same entity, such as an object or a part of an object. 
                        Each capsule tries to learn to recognize an object or an object part in the image and outputs a vector, where the length of the vector represents the probability of 
                        the object's presence, and the orientation represents the instantiation parameters (position, rotation, scale, etc.).</li>
                        <li><strong>Dynamic Routing: </strong>Instead of pooling layers used in CNNs, Capsule Networks use a dynamic routing algorithm to ensure that the output of one capsule
                         gets sent to an appropriate parent capsule in the next layer. This process helps preserve the spatial hierarchies between features.</li>
                        <li><strong>Robustness to Affine Transformations: </strong>Capsule Networks are designed to be more robust to affine transformations (like rotation, scaling, translation) 
                        than traditional CNNs because they preserve the hierarchical relationships between features.</li>

                    </ul>

                    A Capsule Network typically consists of several layers, including convolutional layers for feature detection, Primary Capsule layers, and Digit Capsule layers.
                     Capsules in a lower layer make predictions about the output of higher-level capsules, based on their inputs. The agreement between these predictions is measured using
                     a routing algorithm, such as "dynamic routing," which iteratively adjusts the connection strengths (or "coupling coefficients") between capsules.
                </p>

                <p className="subsubsection-paragraph">
                While Capsule Networks were primarily proposed for image data, their underlying principles can be applied to NLP tasks as well. In NLP, understanding the hierarchical structure of
                 language and the relationships between different parts of a sentence or document is essential. CapsNets can potentially capture these relationships more effectively than traditional
                  models.

                  <ul>
                    <li><strong>Document Classification: </strong>CapsNets can be used to understand the hierarchical structure of documents, where sentences form paragraphs and paragraphs form the document, 
                    preserving the relationships between these components.</li>
                    <li><strong>Sentence Modelling: </strong>They can also be applied to sentence-level tasks, capturing the relationships between different entities in the sentence and their attributes.</li>
                  </ul>

                  However, applying Capsule Networks to NLP is still an area of ongoing research, and there are challenges to overcome, such as defining what constitutes a "part" in the context of
                   language and effectively implementing dynamic routing for sequential data. While promising, CapsNets in NLP haven't yet reached the same level of maturity and widespread adoption
                    as they have in some image processing tasks.
                </p>

                {/* <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Capsule Networks find applications in tasks that require understanding of spatial relationships and pose estimation, such as 3D object detection and fine-grained
                     image classification. Implementing CapsNet in Python requires using deep learning libraries like TensorFlow or PyTorch. Here’s an example of defining a simple Capsule 
                     Network architecture using TensorFlow:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Assume texts is your list of text samples
tokenizer = Tokenizer(num_words=10000)  # Adjust num_words as needed
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=100)  # Adjust maxlen as needed

# Define a simple Capsule Layer
class CapsuleLayer(layers.Layer):
    def __init__(self, num_capsules, dim_capsule, **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsules = num_capsules
        self.dim_capsule = dim_capsule
        # Additional initialization as needed

    def build(self, input_shape):
        # Define weights and biases
        # Implement dynamic routing if needed

    def call(self, inputs):
        # Define the forward pass, routing mechanism
        # Return the output vector of the capsules

# Building the Capsule Network model
def CapsNet(input_shape, num_classes):
    inputs = layers.Input(shape=input_shape)
    x = layers.Embedding(input_dim=10000, output_dim=100, input_length=100)(inputs)  # Adjust these parameters as needed
    primary_caps = CapsuleLayer(num_capsules=32, dim_capsule=8)(x)  # Example parameters
    digit_caps = CapsuleLayer(num_capsules=num_classes, dim_capsule=16)(primary_caps)  # Example parameters
    
    # Define the output layer based on the problem (e.g., classification)
    outputs = layers.Lambda(lambda z: tf.sqrt(tf.reduce_sum(tf.square(z), axis=2)))(digit_caps)  # Length of the capsule vectors
    
    return models.Model(inputs=inputs, outputs=outputs)

model = CapsNet(input_shape=(100,), num_classes=10)  # Adjust num_classes as needed
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# model.fit(data, labels, epochs=10, batch_size=32)  # labels should be your target variable`}
                        </SyntaxHighlighter> */}
                    {/* </p> */}
                </section>


                <section id="adv" className="code-cleaned">
                <h2>Advancements & Variants</h2>

                <h4>Dilated Convolutions</h4>
                <p className="subsubsection-paragraph">
                    Dilated convolutions, also known as atrous convolutions, allow networks to have a wider field of view without increasing the number of parameters. They incorporate gaps into
                     the convolutional filters, effectively expanding the kernel size without an increase in computational complexity. The dilated convolution operation can be mathematically
                      represented as:
                    <BlockMath math="Y(i) = \sum_{k=1}^{K} X(i + r \cdot k) \cdot W(k)" />
                    where <InlineMath math="Y" /> is the output, <InlineMath math="X" /> is the input, <InlineMath math="W" /> is the filter, <InlineMath math="K" /> is the kernel size, 
                    and <InlineMath math="r" /> is the dilation rate. The term <InlineMath math="r \cdot k" /> introduces gaps into the standard convolutional process.
                </p>

                <h4>Depthwise Separable Convolutions</h4>
                <p className="subsubsection-paragraph">
                    Depthwise separable convolutions, used in architectures like MobileNets, decompose a standard convolution into a depthwise convolution and a pointwise convolution. This
                     reduces the computational cost and the number of parameters. The operation involves first applying a single filter per input channel (depthwise convolution), followed by a
                      1x1 convolution (pointwise convolution) to combine the outputs of the depthwise convolution. The mathematical expression for depthwise separable convolution is:
                    <BlockMath math="Y(i, j, m) = \sum_{k, l} X(i + k, j + l, m) \cdot W_{d}(k, l, m)" /> followed by <BlockMath math="Y'(i, j) = \sum_{m} Y(i, j, m) \cdot W_{p}(m)" />
                    where <InlineMath math="m" /> indexes the input channels, and <InlineMath math="W_{d}" /> and <InlineMath math="W_{p}" /> represent the depthwise and pointwise filters, respectively.
                </p>

                <h4>Transposed Convolutions</h4>
                <p className="subsubsection-paragraph">
                    Transposed convolutions, often used in generative models and upsampling, reverse the spatial transformation of a regular convolution: they map a lower-dimensional feature space to a
                    higher-dimensional space. Although they are sometimes called 'deconvolutions', they do not compute the mathematical inverse of a convolution. The transposed convolution can be
                    thought of as distributing a single input value to multiple outputs, scaled by the transposed convolution filter. Mathematically, it can be described as:
                    <BlockMath math="Y(i, j) = \sum_{k, l} X(i - k, j - l) \cdot W^{T}(k, l)" />
                    where <InlineMath math="W^{T}" /> is the transposed convolution filter. This operation increases the spatial dimensions of the input feature map. Here's an example of how some of 
                    these types of convolutions might be implemented: <br/>

                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import tensorflow as tf
from tensorflow.keras import layers, models

# Define a simple CNN model with advanced convolutional layers
model = models.Sequential()

# Adding a dilated convolution layer
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1), dilation_rate=2))

# Adding a depthwise separable convolution layer
model.add(layers.SeparableConv2D(64, (3, 3), activation='relu'))

# Adding a transposed convolution layer
model.add(layers.Conv2DTranspose(64, (3, 3), strides=(2, 2), padding='same', activation='relu'))

# Adding a few more layers to complete the model
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Print the model summary
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Example training and evaluation (assuming you have training and test data)
# model.fit(x_train, y_train, batch_size=64, epochs=10)
# model.evaluate(x_test, y_test)
`}
                        </SyntaxHighlighter>
                </p>
                
            </section>

                
                
                <div className="subsubsection-navigation">
                    <Link to="/ml/classic">← Classic Methods</Link>
                    <Link to="/ml/rnn">Recurrent Neural Networks →</Link>
                </div>
            </main>
            
            <Footer />
        </div>
    );
}

// Expose the CNN page component as this module's default export.
export default CNN;
