import React from 'react';
import '../../styles/subsection.css';
import Header from '../../components/Header';
import Footer from '../../components/Footer';
import { Link } from 'react-router-dom';
import 'katex/dist/katex.min.css';
import { InlineMath, BlockMath } from 'react-katex';
import { LightAsync as SyntaxHighlighter } from 'react-syntax-highlighter';
import { docco } from 'react-syntax-highlighter/dist/esm/styles/hljs';

import autoencoder_arch from '../../media/Adv/autoencoder_arch.png';

function AdvancedML() {
    return (
        <div className="subsubsection-container">
            <Header />
            <div className="side-nav-container">
                <aside className="subsubsection-side-nav">
                    <a href="#auto">Autoencoders</a>
                    <a href="#ntm1">Neural Turing Machine</a>
                    <a href="#ntm2">Neural Topic Modelling</a>
                </aside>
            </div>
            
            <main className="subsubsection-content">
                <div className="titles"><h1>Other Architectures</h1></div>

                <section id="auto" className="code-cleaned">
                <h2>Autoencoders</h2>
                <p className="subsubsection-paragraph">
                    Autoencoders (AE) are neural networks designed for unsupervised learning tasks. They aim to learn a compressed, encoded representation of data, typically for dimensionality
                     reduction or feature extraction. Autoencoders find widespread applications in data denoising, generation, and anomaly detection; they can also be applied to NLP tasks.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Feature learning</span>,
                                    <span style={{ color: '#008000' }}> Dimensionality reduction</span>,
                                    <span style={{ color: '#ff4500' }}> Anomaly detection</span>,
                                    <span style={{ color: '#1e90ff' }}> Data generation and denoising</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>TensorFlow (tf.keras.layers.Dense for building autoencoder layers)</span>,
                                    <span style={{ color: '#20b2aa' }}> PyTorch (torch.nn.Linear for defining encoder and decoder layers)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Depends on the specific architecture; for a basic autoencoder, typically <span>O(n*m)</span>, where <i>n</i> is the number of inputs and <i>m</i> is the number of neurons in the hidden layer
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    While there's no single paper that introduced autoencoders, they were popularized by Hinton and others in the context of deep learning and neural networks. A foundational text is "Reducing the Dimensionality of Data with Neural Networks" by Hinton and Salakhutdinov, 2006.
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>AE Foundations</h4>
                <p className="subsubsection-paragraph">
                An autoencoder is a neural network architecture comprised of two main components: the encoder and the decoder. The encoder's function is to map the input data to a lower-dimensional latent
                 space, effectively compressing the data into a more compact representation. Conversely, the decoder's role is to reconstruct the original input data from this compressed latent representation.
                  The training process involves the use of backpropagation and gradient descent algorithms to minimize the reconstruction error, which is quantified as the difference between the original input 
                  and the reconstructed output.
                </p>

                <p className="subsubsection-paragraph">
                The fundamental principle behind autoencoders is the ability to represent data in a reduced dimensionality while capturing its essential features. Consider, for instance, weather data
                 collected over time. Although we might record variables daily, the underlying patterns could be summarized by a few parameters, demonstrating that data can often be represented more 
                 efficiently. The key to autoencoders is not just in reducing dimensionality but in deliberately inducing information loss during compression. This loss is critical because it compels
                  the network to learn the most salient features of the data, enabling the decoder to reconstruct the input from an incomplete representation. Without this enforced information loss, 
                  the network could trivially pass the input through to the output without learning meaningful data representations.
                </p>   

                <p className="subsubsection-paragraph">
                In the context of autoencoders, the 'lower-dimensional space' refers to the latent space or bottleneck created by the hidden layers within the neural network. The dimensionality of
                 this space is a hyperparameter that determines the level of compression and abstraction the autoencoder achieves. It's this constrained bottleneck that forces the autoencoder to
                  prioritize which aspects of the input data are most crucial for reconstruction, thus learning efficient and meaningful representations of the data.

                    

                    <figure className="flex-container-caption">
                        <div className="flex-container"><img src={autoencoder_arch} alt="Diagram of the vanilla autoencoder architecture" className="image-medium"/></div>
                        <figcaption>The canonical architecture for the vanilla autoencoder. The input is recreated as the output using the latent space as a way to force information loss; <a href="https://www.compthree.com/blog/autoencoder/" target="_blank" rel="noopener noreferrer">image source</a>.</figcaption>
                        </figure>

                    As an example, we could pass in a 
                    long piece of text and get a summarized form of it (numerically) through this embedding layer. Let's walk through an example of a semantic text similarity task where the goal is 
                    to see how similar two pieces of text are in meaning -- this is something that could be used when the goal is plagiarism detection, etc. 

                    <ol>

                        <li><strong>Data Preparation: </strong>The first step involves preparing your text data. This might include cleaning (removing stop words, punctuation, etc.), tokenization 
                        (splitting the text into tokens or words), and then encoding these tokens into numerical vectors. Encoding could be done using one-hot encoding, TF-IDF, word embeddings
                         (like Word2Vec or GloVe), or contextual embeddings from models like BERT.</li>

                        <li><strong>Designing the Autoencoder: </strong>The encoder part of the autoencoder takes the encoded text as input and compresses it into a lower-dimensional latent representation.
                         This involves passing the text through one or more dense (fully connected) layers or recurrent layers like LSTM units, especially useful for
                          capturing the sequential nature of text. The bottleneck layer holds the compressed representation of the input text, capturing its most essential semantic features.
                          The decoder part then attempts to reconstruct the original input from this bottleneck representation, using a structure that mirrors the encoder.</li>

                        
                        <li><strong>Training: </strong> The autoencoder is trained on a large corpus of text, with the goal of minimizing the reconstruction error between the original texts and their 
                        reconstructions from the bottleneck representations. This process allows the bottleneck layer to learn dense, semantically rich representations of the text.</li>

                        <li><strong>Extracting Text Representations: </strong> After training, the encoder part of the autoencoder can be used to transform any piece of text into its dense representation 
                        by passing it through the encoder and bottleneck layers. These representations capture the semantic essence of the text.</li>

                        <li><strong>Comparing: </strong>To assess the similarity between two pieces of text, you would convert both texts into their dense representations using the trained encoder. 
                        Then, you can use a similarity metric, such as cosine similarity, to quantify how similar these representations are, which serves as a proxy for semantic similarity.</li>

                    </ol>
                </p> 


                <h4>Denoising Autoencoders</h4>
                <p className="subsubsection-paragraph">

                    This is a variation that allows autoencoders to actually learn how to reconstruct inputs in such a way that it corrects problems that may exist within that input. For example, 
                    if the goal was to build a model that made images unblurry, what we could do is take a set of images, make copies of them that are blurry, and pass those into an autoencoder; 
                    the trick is that the output layer would generate the reconstruction error as a function of the unblurry images -- this would adjust the network's weights in such a way that 
                    when a blurry image is passed in, its output will actually be a reconstruction that is not blurry! An NLP task solved with this kind of model is text correction; i.e. 
                    when we deliberately introduce errors into sentences, we could have the reconstruction error be a function of the corrected sentences. Here's a step by step look at how this would 
                    work: 

                    <ol>

                        <li><strong>Data Preparation: </strong>The first step involves preparing pairs of noisy and clean text samples. Noisy texts can be artificially generated by introducing errors
                         into clean texts or collected from sources known for having noisy data. Clean texts are error-free versions of these texts, either manually corrected or sourced from high-quality
                          datasets.</li>

                        <li><strong>Designing the Autoencoder: </strong> The encoder takes the noisy text as input, encoded into numerical vectors using techniques like one-hot encoding or word embeddings. 
                        The encoder processes this input through several layers (which can be dense, recurrent, or convolutional) to produce a compressed representation in the latent space. 
                        The bottleneck layer then captures the essential information from the noisy text, filtering out any noise. The decoder then takes this representation and reconstructs the clean
                         version of the text, ideally outputting text that is free from errors present in the input.</li>


                        <li><strong>Training: </strong> The DAE is trained on pairs of noisy and clean texts, learning to minimize the difference between the clean texts and their reconstructions from the noisy 
                        inputs. This process enables the model to learn how to correct various types of errors and normalize the text.</li>

                        <li><strong>Text Correction & Normalization: </strong> After training, the denoising autoencoder can be used for correcting new noisy texts. A noisy text is passed through the encoder to the bottleneck layer
                         and then through the decoder to produce a corrected version of the text. The effectiveness of the correction depends on the DAE's ability to learn meaningful transformations from noisy to clean texts.</li>

                        <li><strong>Evaluation: </strong>The quality of text correction can be evaluated by comparing the autoencoder's output with manually corrected texts, using metrics such as word error rate (WER), BLEU 
                        score, or edit distance, which quantify the differences between the corrected text and a reference clean text.</li>

                        </ol>

                </p>

                <h4>Variational Autoencoders</h4>
                <p className="subsubsection-paragraph">
                        Variational Autoencoders (VAEs) offer a probabilistic approach to generating and understanding complex data distributions. In NLP, 
                        VAEs are particularly valuable for their ability to model the underlying semantic and syntactic structures of language data. By learning to
                          encode text into a latent space with a defined probabilistic distribution, VAEs facilitate a range of generative and interpretative NLP tasks.
                    </p>
                    <p className="subsubsection-paragraph">
                        The core components of a VAE include an encoder, which maps input text to a distribution in latent space, and a decoder, which attempts to reconstruct the input from 
                        sampled latent variables. The encoder outputs parameters defining a distribution, typically the mean and variance of a Gaussian, encapsulated mathematically 
                        as <InlineMath math={"q(z|x)"} />, where <InlineMath math={"z"} /> represents the latent variables conditioned on an input <InlineMath math={"x"} />.
                    </p>
                    {/* <p className="subsubsection-paragraph">
                        A distinctive feature of VAEs in NLP is the reparameterization trick, which allows for efficient backpropagation by enabling sampling of latent variables in a differentiable 
                        manner. This trick maintains the stochastic nature of the model while ensuring the learning process remains robust.
                    </p> */}
                    <p className="subsubsection-paragraph">
                        The loss function of a VAE, the negative of the Evidence Lower Bound (ELBO), comprises two parts: the reconstruction loss, which ensures the decoded samples closely resemble the
                         original text, and the Kullback-Leibler (KL) divergence, a regularization term encouraging the latent distributions to approximate a prior distribution, often chosen to be 
                         a standard Gaussian.
                    </p>
                    <BlockMath math={"L(x, \\hat{x}) = -\\mathbb{E}_{q(z|x)}[\\log p(x|z)] + KL(q(z|x) \\| p(z))"} />
                    <p className="subsubsection-paragraph">
                        In NLP, training a VAE on large corpora of text enables the model to capture the intricate patterns and structures inherent in natural language. Once trained, the encoder part 
                        of the VAE can be used to generate dense, meaningful representations of text that can be leveraged for tasks such as text generation, where new text sequences are synthesized by 
                        sampling from the latent space, or for improving language models by introducing variability and handling uncertainty in text data.
                    </p>
                    <p className="subsubsection-paragraph">
                        The generative capability of VAEs also opens up possibilities for tasks like text style transfer, where the model learns to modify the style or tone of a given text while preserving
                         its original content, and for semi-supervised learning in NLP, where the model can make use of both labeled and unlabeled data to enhance learning outcomes.
                    </p>
                    <p className="subsubsection-paragraph">
                        Despite their potential, VAEs in NLP face challenges such as balancing the reconstruction quality with the regularity of the latent space and avoiding the posterior collapse problem, 
                        where the model might ignore the latent code. Addressing these challenges is crucial for fully harnessing the power of VAEs in natural language understanding and generation.
                    </p>

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                    <strong>Latent Space Dimensionality:</strong> This hyperparameter defines the size of the bottleneck layer, which is crucial for determining the level of compression and
                     abstraction achieved. A smaller latent space forces the autoencoder to learn more efficient representations, but may result in higher reconstruction error.
                    </li>
                    <li>
                    <strong>Type of Autoencoder:</strong> The architecture choice (e.g., Variational Autoencoder, Denoising Autoencoder, Sparse Autoencoder) influences the model's ability to 
                    learn representations. Each type introduces specific mechanisms, like variational inference or denoising criteria, that guide the learning process.
                    </li>
                    <li>
                    <strong>Sparsity Regularization:</strong> In sparse autoencoders, this term penalizes the activation of neurons in the hidden layers, encouraging the model to use only a small
                     number of active neurons at any time. This leads to more distinctive features being captured in the latent representation.
                    </li>
                    <li>
                    <strong>Reconstruction Loss Weight:</strong> In models like Variational Autoencoders, the balance between the reconstruction loss and the KL divergence term in the loss function
                     is critical. Adjusting this balance can significantly impact the model's performance and the quality of generated samples.
                    </li>
                    <li>
                    <strong>Noise Level:</strong> For Denoising Autoencoders, the degree of noise introduced to the input data during training is a key hyperparameter. It defines the model's
                     robustness and its ability to reconstruct clean data from noisy inputs.
                    </li>
                    <li>
                    <strong>KL Divergence Weight:</strong> Specifically in Variational Autoencoders, this weight controls the impact of the KL divergence term on the total loss, balancing the latent
                     space regularization with the reconstruction accuracy.
                    </li>
                </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Here's a Python example using TensorFlow (Keras) to implement a basic LSTM sequence autoencoder for text:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, RepeatVector, Dense

# Define a small dataset
sentences = [
    "The cat sat on the mat",
    "Dogs are amazing pets",
    "The sun is bright today",
    "I love reading books",
    "Data science is fascinating"
]

# Preprocess the data
def preprocess_sentences(sentences):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    max_sequence_length = max(len(s) for s in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    return padded_sequences, tokenizer, max_sequence_length

padded_sequences, tokenizer, max_sequence_length = preprocess_sentences(sentences)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Build the autoencoder model
embedding_dim = 8

input_seq = Input(shape=(max_sequence_length,))
encoded = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length)(input_seq)
encoded = LSTM(16)(encoded)
decoded = RepeatVector(max_sequence_length)(encoded)
decoded = LSTM(embedding_dim, return_sequences=True)(decoded)
decoded = Dense(vocab_size, activation='softmax')(decoded)

autoencoder = Model(input_seq, decoded)
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
autoencoder.summary()

# Prepare the data for training
target_data = np.expand_dims(padded_sequences, -1)

# Train the autoencoder
autoencoder.fit(padded_sequences, target_data, epochs=100, batch_size=32)

# Function to encode and decode sentences
def encode_decode_sentence(sentence, tokenizer, autoencoder, max_sequence_length):
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length, padding='post')
    prediction = autoencoder.predict(padded_sequence)
    decoded_sentence = ' '.join(tokenizer.index_word.get(np.argmax(word), '') for word in prediction[0])
    return decoded_sentence.strip()

# Test the autoencoder with a new sentence
test_sentence = "The sun is bright"
decoded_sentence = encode_decode_sentence(test_sentence, tokenizer, autoencoder, max_sequence_length)

print(f"Original sentence: {test_sentence}")
print(f"Decoded sentence: {decoded_sentence}")`}
                        </SyntaxHighlighter>
                    </p>
                </section>

                
                <section id="ntm1" className="code-cleaned">
                <h2>Neural Turing Machine</h2>
                <p className="subsubsection-paragraph">
                Neural Turing Machines (NTMs) are a class of neural network models that extend traditional neural networks with memory capabilities, enabling them to solve complex tasks that require
                 manipulation and storage of data over time. The NTM architecture combines a neural network controller with an external memory bank, much like the way a conventional Turing machine 
                 operates with a read/write head over a tape. The controller interacts with the memory through read and write operations, which are differentiable, allowing the entire system to be 
                 trained end-to-end with gradient descent.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Algorithm learning</span>,
                                    <span style={{ color: '#008000' }}> Pattern recognition</span>,
                                    <span style={{ color: '#ff4500' }}> Sequence prediction</span>,
                                    <span style={{ color: '#1e90ff' }}> Data classification</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    Implementation of NTMs is more research-oriented and often requires custom development; there are unofficial implementations in <span style={{ color: '#6a5acd' }}>TensorFlow</span> and <span style={{ color: '#20b2aa' }}>PyTorch</span> available on platforms like GitHub.
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    The computational complexity of NTMs is not straightforward due to their dynamic memory access and depends on the specific operations and the size of the memory matrix.
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"Neural Turing Machines"</span> by Alex Graves, Greg Wayne, and Ivo Danihelka, 2014; introduced the concept of NTMs, blending the worlds of neural networks and Turing machines.
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>



                <h4>NTM Foundations</h4>
                <p className="subsubsection-paragraph">
                   There are a few main components of an NTM, namely: 

                   <ul>
                    <li><strong>Controller: </strong> The core of an NTM is a neural network (often an LSTM or GRU) that acts as the controller. It takes as input the current input data and the information
                     read from the memory at the previous timestep, and outputs a set of instructions for reading from and writing to the memory, as well as the output for the current timestep.</li>

                    <li><strong>Memory: </strong>The memory is a matrix where each row represents a memory slot. The controller reads from and writes to these slots through the read and 
                    write heads, following the instructions it generates at each timestep.</li>

                    <li><strong>Read and Write Heads: </strong>These are mechanisms that define how the controller interacts with the memory. The read head fetches information from the memory, while
                     the write head updates the memory contents. The interactions are governed by attention mechanisms, allowing the model to focus on specific parts of the memory.</li>

                   </ul>
                </p>


                <p className="subsubsection-paragraph">

                <p>Let's denote the following:</p>
      
                        <ul>
                            <li>
                            <InlineMath math="M_t" /> as the memory matrix at time <InlineMath math="t" />,
                            </li>
                            <li>
                            <InlineMath math="\mathbf{r}_t" /> as the read vector,
                            </li>
                            <li>
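                            <InlineMath math="\mathbf{w}_t" /> as the weighting vector over memory slots (<InlineMath math="w_t^r" /> for reading, <InlineMath math="w_t^w" /> for writing),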
                            </li>
                            <li>
                            <InlineMath math="\mathbf{e}_t" /> as the erase vector, and
                            </li>
                            <li>
                            <InlineMath math="\mathbf{a}_t" /> as the add vector.
                            </li>
                        </ul>
                    <p>
                        The read operation at time <InlineMath math="t" /> is defined by a weighted sum over the memory matrix:
                    </p>
                    <BlockMath math="\mathbf{r}_t = \sum_i w_t^r(i) M_t(i)" />
                    
                    <p>
                        Where <InlineMath math="w_t^r(i)" /> are the read weights for memory slot <InlineMath math="i" />,
                        determined by the read head's attention mechanism.
                    </p>
                    
                    <p>
                        Writing to the memory involves two steps, erasing and adding. The erase operation modifies the contents of the memory as follows:
                    </p>
                    <BlockMath math="M_t'(i) = M_{t-1}(i) \left[ \mathbf{1} - w_t^w(i) \mathbf{e}_t \right]" />
                    
                    <p>
                        Where <InlineMath math="M_t'(i)" /> is the intermediate memory state after erasing, <InlineMath math="w_t^w(i)" /> are the write weights,
                        and <InlineMath math="\mathbf{1}" /> is a vector of ones. Then, new information is added to the memory:
                    </p>
                    <BlockMath math="M_t(i) = M_t'(i) + w_t^w(i) \mathbf{a}_t" />
                    
                    <p>
                        The read and write weights <InlineMath math="w_t^r(i)" /> and <InlineMath math="w_t^w(i)" /> are typically computed using attention mechanisms,
                        such as content-based addressing and location-based addressing, allowing the model to focus on specific memory slots based on the content or relative location.
                    </p>
                </p>

                <p className="subsubsection-paragraph">
                The NTM's ability to manipulate its memory allows it to perform tasks that require maintaining and updating an internal state or remembering past information, 
                which is particularly useful in NLP for tasks like language modeling, parsing, and machine translation. Consider the task of predicting the next word in a sentence
                 given the previous words. An NTM can approach this task as follows:

                 <ul>
                    <li>
                    <strong>Initialization:</strong> The memory is initialized to some starting state, often zeros.
                    </li>
                    <li>
                    <strong>Reading Context:</strong> As the NTM processes each word in the sentence, it reads from the memory to retrieve relevant context or information from previous parts of the sentence.
                    </li>
                    <li>
                    <strong>Updating Memory:</strong> Based on the current word and the retrieved context, the NTM updates its memory to reflect new information or relationships learned from the current word.
                    </li>
                    <li>
                    <strong>Predicting Next Word:</strong> The controller uses the current word, the context from the memory, and its internal state to predict the next word in the sentence.
                    </li>
                    <li>
                    <strong>Loop:</strong> The process repeats for each word in the sentence, with the NTM updating its memory and making predictions at each step.
                    </li>
                </ul>
                <p>
                    The key advantage of NTMs in this scenario is their ability to explicitly store and retrieve information from different parts of the sentence, enabling them to capture long-range 
                    dependencies and complex relationships between words, which are challenging for conventional neural networks.
                </p>
                <p>
                    Training an NTM involves adjusting the parameters of the controller, read heads, and write heads to minimize the prediction error, typically using backpropagation through time.
                     The differentiable nature of the read and write operations allows gradients to flow from the output back to the memory and controller, enabling end-to-end training.
                </p>


                </p>

                <h4>Differentiable Neural Computers</h4>
                    <p className="subsubsection-paragraph">
                        Differentiable Neural Computers (DNCs) are an extension of NTMs, introduced to overcome some of their limitations, particularly in terms of memory management and 
                        scaling. DNCs introduce a more sophisticated memory allocation and retrieval mechanism, allowing for dynamic memory management and improved memory utilization. They 
                        retain the NTM's basic structure but add mechanisms like dynamic memory allocation and temporal memory linkage, enabling more complex data manipulations and efficient 
                        learning of data structures.
                    </p>

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                    <strong>Memory Size (</strong><InlineMath math="N \times M" /><strong>):</strong> The dimensions of the memory matrix, where <InlineMath math="N" /> is the number of memory 
                    slots and <InlineMath math="M" /> is the vector size of each memory slot.
                    </li>
                    <li>
                    <strong>Controller Size:</strong> The size and architecture of the neural network controller (e.g., number of layers and units in LSTM/GRU).
                    </li>
                    <li>
                    <strong>Read Heads:</strong> The number of read heads, which determines how many distinct memory locations can be read simultaneously at each timestep.
                    </li>
                    <li>
                    <strong>Write Heads:</strong> The number of write heads, which influences how many memory locations can be written to or updated simultaneously.
                    </li>
                    <li>
                    <strong>Learning Rate:</strong> The step size at each iteration while moving toward a minimum of a loss function, crucial for training the NTM's parameters.
                    </li>
                    <li>
                    <strong>Addressing Mechanism Parameters:</strong> Parameters that control the read and write heads' focus, including those for content-based and location-based addressing.
                    </li>
                </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    Implementing an NTM in Python requires constructing the controller and memory components, along with the mechanisms for reading from and writing to the memory. Here's a 
                    simplified conceptual example using TensorFlow:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`# Example is conceptual and focuses on the architecture components
import tensorflow as tf

class NTMCell(tf.keras.layers.Layer):
    """Conceptual Neural Turing Machine cell: an LSTM controller plus external memory heads.

    Args:
        controller_units: Number of units in the LSTM controller.
        memory_size: Memory dimension forwarded to the read and write heads.
        memory_vector_dim: Per-slot vector dimension forwarded to the read and write heads.
    """

    def __init__(self, controller_units, memory_size, memory_vector_dim):
        super().__init__()
        self.controller = tf.keras.layers.LSTMCell(controller_units)
        self.memory_size = memory_size
        self.memory_vector_dim = memory_vector_dim
        self.read_head = ReadHead(memory_size, memory_vector_dim)
        self.write_head = WriteHead(memory_size, memory_vector_dim)

    def call(self, x, states):
        """Run one timestep of the cell.

        Args:
            x: Input for the current timestep.
            states: Mapping with 'controller_state' and 'memory' entries.

        Returns:
            Tuple (controller_output, new_memory, controller_state).
        """
        # Controller operations
        controller_output, controller_state = self.controller(x, states['controller_state'])
        # Memory operations: both heads are driven by the controller output
        read_vector = self.read_head(controller_output, states['memory'])
        write_vector = self.write_head(controller_output, states['memory'])
        # Update memory
        new_memory = self.update_memory(states['memory'], read_vector, write_vector)
        return controller_output, new_memory, controller_state

    def update_memory(self, memory, read_vector, write_vector):
        """Return the memory matrix after applying the write.

        Left unimplemented in this conceptual example. In the original NTM the
        write is an erase step followed by an add step, both weighted by the
        write head's addressing weights.
        """
        raise NotImplementedError(
            "update_memory is omitted in this conceptual example; "
            "implement the erase/add write step here"
        )

# NTMCell would be used as part of a larger model, integrated with input and output processing

# The ReadHead and WriteHead classes, along with the memory update mechanisms, are not detailed here
# but involve implementing the addressing and manipulation logic of the NTM's memory matrix.
`}
                        </SyntaxHighlighter>
                    </p>

                </section>



                <section id="ntm2" className="code-cleaned">
    <h2>Neural Topic Modelling</h2>
    <p className="subsubsection-paragraph">
        Neural Topic Modeling integrates neural networks into traditional topic modeling, enhancing the ability to discover abstract topics from a collection of documents. This
         approach leverages the representational learning and generalization capabilities of neural networks, offering improvements over classical methods like Latent Dirichlet 
         Allocation in terms of flexibility and performance.
    </p>

                <p className="subsubsection-paragraph">
                <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                    <tbody>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span style={{ color: '#333399' }}>Content categorization</span>,
                                <span style={{ color: '#008000' }}> Information retrieval</span>,
                                <span style={{ color: '#ff4500' }}> Document clustering</span>,
                                <span style={{ color: '#1e90ff' }}> Trend analysis</span>
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                Neural topic modeling often involves custom architectures; however, libraries like <span style={{ color: '#6a5acd' }}>TensorFlow</span> and <span style={{ color: '#20b2aa' }}>PyTorch</span> provide foundational tools for building such models. High-level APIs like <span style={{ color: '#ff6347' }}>Keras</span> can also be utilized for simpler implementations.
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                The computational complexity can vary significantly based on the architecture, such as the number of layers and the size of the input documents. Generally, it involves matrix operations whose complexity can be estimated as <span>O(n*m)</span>, where <i>n</i> is the number of documents and <i>m</i> is the number of features or hidden units.
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                A significant paper in this area is <span>"Neural Variational Inference for Text Processing"</span> by Miao, Yu, and Blunsom, 2016, which discusses using neural networks for topic modeling and other NLP tasks.
                            </td>
                        </tr>
                    </tbody>
                </table>
            </p>


    <h4>Neural Topic Modelling Foundations</h4>
    <p className="subsubsection-paragraph">
      
      <p>Neural topic modeling is an approach that leverages neural networks to discover abstract topics from a collection of documents. It's a modern take on traditional topic modeling methods
         like Latent Dirichlet Allocation, aiming to improve flexibility, scalability, and the quality of learned topic representations. Neural topic models can automatically learn intricate
          patterns in data, making them capable of handling large and complex text datasets.</p>
      
      <p>At its core, topic modeling is about finding a way to summarize a large collection of text documents through a set of topics, where each topic represents a collection of words that frequently 
        occur together. For instance, in a collection of news articles, you might find topics related to politics, sports, technology, etc. Each document in the dataset can be viewed as a mixture of these 
        topics.</p>

      <p>Neural topic models approach this task by training a neural network to learn representations of documents in such a way that documents sharing similar themes are close to each other in a learned
         topic space. This is achieved by encoding documents into dense vectors (embeddings) that capture semantic meanings and thematic structures.</p>
      

      <p>The foundation of neural topic modeling involves several key components, typically including an encoder, a topic distribution layer, and sometimes a decoder, depending on
         the specific architecture (e.g., Variational Autoencoders for topic modeling). Each of these components can be implemented with any of a number of neural network architectures.</p>
      
      <p>The encoder part of a neural topic model transforms documents into compact representations. Given a document <InlineMath math="d" /> represented as a bag-of-words 
      vector <InlineMath math="\mathbf{x}" />, the encoder, often a neural network, maps <InlineMath math="\mathbf{x}" /> to a latent topic distribution <InlineMath math="\mathbf{z}" />:</p>
      <BlockMath math="\mathbf{z} = f_{\text{encoder}}(\mathbf{x}; \theta)" />
      
      <p>The latent topic distribution <InlineMath math="\mathbf{z}" /> represents the document's composition of topics. In many models, <InlineMath math="\mathbf{z}" /> is constrained to be a valid probability distribution,
       either by drawing it from a Dirichlet distribution or by applying a softmax function to the encoder's unnormalized output <InlineMath math="\mathbf{h}" />, so that the elements of <InlineMath math="\mathbf{z}" /> are non-negative and sum to 1:</p>
      <BlockMath math="\mathbf{z} \sim \text{Dirichlet}(\alpha) \quad \text{or} \quad \mathbf{z} = \text{softmax}(\mathbf{h})" />
      
      <p>Some neural topic models, like those based on autoencoder architectures, include a decoder that attempts to reconstruct the original document from the topic distribution:</p>
      <BlockMath math="\hat{\mathbf{x}} = f_{\text{decoder}}(\mathbf{z}; \phi)" />
      
      <p>The model is trained to maximize the likelihood of observing the documents given their topic distributions, often using variational inference for models that incorporate latent variables with 
        complex posterior distributions. The objective function can include terms that encourage the model to learn meaningful and distinct topics, such as sparsity constraints or regularization terms.</p>
      
      <ul>
        <li><strong>Encoding:</strong> Converts raw text data into a form that captures the underlying semantic and thematic structures. This step is crucial for distilling the essence of each document 
        into a dense vector that reflects its topic composition.</li>
        <li><strong>Topic Distribution:</strong> Represents each document in the context of the entire corpus, highlighting its thematic affiliations. This distribution is the model's way of summarizing
         the main themes present in a document.</li>
        <li><strong>Decoding (if applicable):</strong> Serves as a way to validate the model's understanding of the topics. By attempting to reconstruct the original document from its topic distribution,
         the model demonstrates its ability to use the learned topics to capture the essential content of the documents.</li>
      </ul>
      
      <p>Consider a corpus of news articles. A neural topic model might learn topics corresponding to international relations, technology, sports, and health. For a given article about a recent technology
         conference, the model might represent it with a high probability assigned to the technology topic, and smaller probabilities for the other topics. This representation could then be used for tasks like document classification, recommendation systems, or content summarization, providing a powerful tool for organizing and understanding large text datasets.</p>
      
      {/* <p>Neural topic models, with their ability to learn nuanced patterns and scale to large datasets, offer a sophisticated approach to uncovering the latent thematic structure in text, making them a
         valuable tool in the NLP toolkit.</p> */}
    </p>

    <h4>Hyperparameters</h4>
    <p className="subsubsection-paragraph">
    <ul>
        <li>
          <strong>Topic Dimensionality:</strong> The number of topics <InlineMath math="K" /> the model is expected to learn. This defines the size of the topic distribution vector <InlineMath math="\mathbf{z}" /> for each document.
        </li>
        <li>
          <strong>Encoder Architecture:</strong> The configuration of the neural network used as the encoder, including the type (e.g., feedforward, LSTM, Transformer) and size (number of layers and units per layer).
        </li>
        <li>
          <strong>Decoder Architecture (if applicable):</strong> Similar to the encoder, for models that include a decoder, this refers to the type and size of the neural network used for decoding.
        </li>
        <li>
          <strong>Dirichlet Concentration Parameter <InlineMath math="\alpha" />:</strong> For models that use a Dirichlet distribution to model topic distributions, <InlineMath math="\alpha" /> controls the sparsity/density of the topic distribution across documents.
        </li>
        <li>
          <strong>Word Embedding Dimensionality:</strong> The size of the word embeddings used to represent individual words in the vocabulary. This affects the input layer size for the encoder.
        </li>
        <li>
          <strong>Reconstruction Loss Weight:</strong> In models that include a reconstruction objective (like autoencoder-based models), this parameter balances the topic modeling loss with the reconstruction loss.
        </li>
      </ul>
    </p>

    <h4>In Code</h4>
    <p className="subsubsection-paragraph">
        Implementing a basic neural topic model can be illustrated with a Python example using TensorFlow and Keras:
        <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
max_words = 10000  # Vocabulary size
max_len = 500  # Maximum length of a document
(x_train, _), (x_test, _) = reuters.load_data(num_words=max_words, maxlen=max_len)
# No padding is applied: the bag-of-words encoding below ignores word order and
# document length, and zero-padding would mark index 0 as a word that is
# present in every padded document.

# Convert to a multi-hot (bag-of-words presence) encoding
def one_hot_sequences(sequences, dimension=max_words):
    """Encode each sequence of word indices as a multi-hot vector.

    Args:
        sequences: Sized collection of integer index sequences; every index
            must be smaller than dimension.
        dimension: Length of each output vector (defaults to the vocabulary size).

    Returns:
        float32 array of shape (len(sequences), dimension) holding 1.0 at every
        index that occurs in the corresponding sequence and 0.0 elsewhere.
    """
    # float32 is the default Keras dtype and halves memory compared to float64
    results = np.zeros((len(sequences), dimension), dtype="float32")
    for i, sequence in enumerate(sequences):
        # Fancy indexing sets all indices of this document at once;
        # repeated indices simply stay at 1.0
        results[i, sequence] = 1.0
    return results

x_train = one_hot_sequences(x_train)
x_test = one_hot_sequences(x_test)

# Model parameters
embedding_dim = 64  # Size of the document embedding
topic_dim = 10  # Number of topics to learn

# Neural topic model: encoder -> softmax topic distribution -> decoder
input_doc = layers.Input(shape=(max_words,))
encoded = layers.Dense(embedding_dim, activation='relu')(input_doc)
topic_distribution = layers.Dense(topic_dim, activation='softmax', name='topic_distribution')(encoded)
# Each output unit independently predicts whether a vocabulary word occurs in the document
decoded = layers.Dense(max_words, activation='sigmoid')(topic_distribution)

model = Model(input_doc, decoded)
# The targets are multi-hot vectors (many words per document), so the sigmoid
# outputs are paired with binary cross-entropy instead of categorical cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train as an autoencoder: every document vector is both the input and the target
model.fit(
    x_train,
    x_train,
    epochs=10,
    batch_size=64,
    validation_data=(x_test, x_test),
)

# Build a second model over the same layers that stops at the topic layer
topic_model = Model(inputs=input_doc, outputs=topic_distribution)

# Example: Get topic distribution for the first document in the test set
topic_distribution_output = topic_model.predict(x_test[:1])
print("Topic distribution for the first document:", topic_distribution_output)`}
                        </SyntaxHighlighter>
                    </p>
                </section>

                
                
                <div className="subsubsection-navigation">
                    <Link to="/ml/seq2seq">← Seq2Seq</Link>
                    <Link to="/ml/transfer">Transfer Learning →</Link>
                </div>
            </main>
            
            <Footer />
        </div>
    );
}

export default AdvancedML;
