import React from 'react';
import '../../styles/subsection.css';
import Header from '../../components/Header';
import Footer from '../../components/Footer';
import { Link } from 'react-router-dom';
import 'katex/dist/katex.min.css';
import { InlineMath, BlockMath } from 'react-katex';
import { LightAsync as SyntaxHighlighter } from 'react-syntax-highlighter';
import { docco } from 'react-syntax-highlighter/dist/esm/styles/hljs';

import word2vec_ohe from '../../media/ClassicNLP/word2vec_ohe.png';
import word2vec_arch from '../../media/ClassicNLP/word2vec_arch.png';
import word2vec_pairs from '../../media/ClassicNLP/word2vec_pairs.png';
// import glove_matrix from '../../media/ClassicNLP/glove_matrix.png';
// import hmm_transition from '../../media/ClassicNLP/hmm_hidden.png';
// import hmm_emission from '../../media/ClassicNLP/hmm_emission.png';

function ClassicML() {
    return (
        <div className="subsubsection-container">
            <Header />
            <div className="side-nav-container">
                <aside className="subsubsection-side-nav">
                    <a href="#hmm">Hidden Markov Models</a>
                    <a href="#lda">Latent Dirichlet Allocation</a>
                    <a href="#word2vec">Word2Vec & Doc2Vec</a>
                    <a href="#glove">GloVe</a>
                </aside>
            </div>
            
            <main className="subsubsection-content">
                <div className="titles"><h1>Classical NLP Methods</h1></div>

                <section id="hmm"  className="code-cleaned">
                <h2>Hidden Markov Models</h2>
                <p className="subsubsection-paragraph">
                    Hidden Markov Models (HMMs) are statistical models which output a sequence of symbols or quantities. They are especially known for their application in temporal pattern recognition
                     such as speech, handwriting, gesture recognition, part-of-speech tagging, and bioinformatics.
                </p>

                <p className="subsubsection-paragraph">
                    <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                        <tbody>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#333399' }}>Part-of-speech tagging</span>,
                                    <span style={{ color: '#008000' }}> Speech recognition</span>,
                                    <span style={{ color: '#ff4500' }}> Named entity recognition</span>,
                                    <span style={{ color: '#1e90ff' }}> Text generation</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span style={{ color: '#6a5acd' }}>hmmlearn</span>,
                                    <span style={{ color: '#20b2aa' }}> NLTK (nltk.tag.hmm)</span>
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>O(N^2T)</span>, where <i>N</i> is the number of states and <i>T</i> is the length of the observed sequence
                                </td>
                            </tr>
                            <tr>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                    <span>"A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition"</span> by Lawrence R. Rabiner, 1989
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </p>


                <h4>HMM Foundations</h4>
                <p className="subsubsection-paragraph">
                    An HMM is characterized by five elements: a set of states, a set of observations, a start probability (probability of the start state), transition probabilities (probability of 
                    transitioning from one state to another), and emission probabilities (probability of an observation being generated from a state). The core assumption in an HMM is that the 
                    probability of each output observation only depends on the current state. This is represented mathematically as:
                    <BlockMath math="P(o_t | s_t, s_{t-1}, ..., s_1) = P(o_t | s_t)" />
                    where <InlineMath math="o_t" /> is the observation at time <InlineMath math="t" /> and <InlineMath math="s_t" /> is the hidden state at time <InlineMath math="t"/>.
                </p>

                <p className="subsubsection-paragraph">
                    To explain this model without getting too bogged down in the math, let's just focus on a specific example with the task of POS-Tagging. We are going to take a sentence and work through 
                    how the model would figure out what type of word (noun, verb, or adjective) each word is in the sentence "That person is great at running". Consider the following components:
                    <ul>
                        <li><strong>Initial State Probabilities:</strong> The probability of each POS tag being the start of a sentence. For simplicity, let's assume we have three tags: Noun (N), Verb (V), and Adjective (Adj).</li>
                        <li><strong>Transition Probabilities:</strong> The probability of transitioning from one POS tag to another. For example, the probability of a noun being followed by a verb.</li>
                        <li><strong>Emission Probabilities:</strong> The probability of a word given a POS tag. For example, the probability of the word "running" being a verb. </li>    
                    </ul>

                    Now, imagine this is the initial state (or start probability):
                    <ul>
                        <li>Noun (N): 0.3</li>
                        <li>Verb (V): 0.2</li>
                        <li>Adjective (Adj): 0.5</li>
                    </ul>

                    These are the transition probabilities between hidden states: <br/>

                    <table className="centered-table">
                        <tr>
                            <th>From\To</th>
                            <th>N</th>
                            <th>V</th>
                            <th>Adj</th>
                        </tr>
                        <tr>
                            <td>N</td>
                            <td>0.1</td>
                            <td>0.6</td>
                            <td>0.3</td>
                        </tr>
                        <tr>
                            <td>V</td>
                            <td>0.4</td>
                            <td>0.1</td>
                            <td>0.5</td>
                        </tr>
                        <tr>
                            <td>Adj</td>
                            <td>0.7</td>
                            <td>0.2</td>
                            <td>0.1</td>
                        </tr>
                        </table>


                        And these are the emission probabilities: 

                        <table className="centered-table">
                            <tr>
                                <th>Word</th>
                                <th>N</th>
                                <th>V</th>
                                <th>Adj</th>
                            </tr>

                            <tr>
                                <td>that</td>
                                <td>0.1</td>
                                <td>0.1</td>
                                <td>0.8</td>
                            </tr>
                            <tr>
                                <td>person</td>
                                <td>0.8</td>
                                <td>0.1</td>
                                <td>0.1</td>
                            </tr>
                            <tr>
                                <td>is</td>
                                <td>0.1</td>
                                <td>0.8</td>
                                <td>0.1</td>
                            </tr>
                            <tr>
                                <td>great</td>
                                <td>0.1</td>
                                <td>0.1</td>
                                <td>0.8</td>
                            </tr>
                            <tr>
                                <td>running</td>
                                <td>0.1</td>
                                <td>0.8</td>
                                <td>0.1</td>
                            </tr>
                            </table>

                Start with the initial state probabilities. For the first word "That", we would look at the emission probability for each tag 
                and multiply it by the initial state probabilities. Choose the highest product for the first tag. Let's use some made up 
                numbers here to illustrate this (I will discuss later how these probabilities are found):

                <ul>
                    <li>Noun (N): 0.3</li>
                    <li>Verb (V): 0.2</li>
                    <li>Adjective (Adj): 0.5</li>
                </ul>

                <p>Emission Probabilities for "That": </p>
                <ul>
                    <li>N: 0.1</li>
                    <li>V: 0.1</li>
                    <li>Adj: 0.8</li>
                </ul>

                <p>For the first word "That", you calculate:</p>
                <ul>
                    <li>
                    For Noun: <InlineMath math="0.3 \times 0.1 = 0.03" />
                    </li>
                    <li>
                    For Verb: <InlineMath math="0.2 \times 0.1 = 0.02" />
                    </li>
                    <li>
                    For Adjective: <InlineMath math="0.5 \times 0.8 = 0.4" />
                    </li>
                </ul>

              
                    The highest probability is for the Adjective tag, so "That" is most likely an Adjective given these probabilities. This conclusion kicks off the process, and subsequent words 
                    will consider the transition probabilities from "That" being an Adjective to whatever the next tag might be, in addition to the emission probabilities of the next word. 
                               For each subsequent word, calculate the probability of each tag by considering: 
            <ul>
                <li>The emission probability of the word for that tag.</li>
                <li>The transition probability from the previously chosen tag to the current tag.</li>
                <li>Multiply these probabilities and choose the tag with the highest value.</li>
            </ul>

            After reaching the last word, backtrack to find the most likely sequence of tags that led to the final choice.
                
                
                </p>

                <p className="subsubsection-paragraph">So, to summarize, assuming that "That" was tagged as Adj, to find the tag for "person":
                    <ul>
                        <li>
                        Calculate the product of the transition probability from Adj to N, V, and Adj respectively, with the emission probability of "person" for each tag.
                        </li>
                        <li>
                        Assume transition probabilities from Adj as Adj-<InlineMath math = ">"/>N: 0.7, Adj-<InlineMath math = ">"/>V: 0.2, Adj-<InlineMath math = ">"/>Adj: 0.1, 
                        and emission probabilities for "person" as N: 0.8, V: 0.1, Adj: 0.1.
                        </li>
                        <li>
                        The product for N would be <InlineMath math="0.7 \times 0.8 = 0.56" />.
                        </li>
                        <li>Repeat for V and Adj, and choose the tag with the highest product.</li>
                    </ul>

                    In practice, these probabilities are estimated from a tagged corpus (a dataset where each word is labeled with its correct POS tag). Techniques like Maximum Likelihood Estimation 
                    can be used to calculate the probabilities based on the frequency of occurrences and transitions in the training data. Once this is done, the model is essentially "trained".
                    The complexity of HMMs primarily lies not in estimating these parameters, which is relatively straightforward given a sufficiently large and representative labeled dataset, 
                    but in using these parameters to infer the most likely sequence of hidden states (in this case, part-of-speech tags) for new, unseen sequences of observed states (words in a sentence). 

                 The inference process involves calculations like:
                        <ol>
                            <li>
                            Determining the most likely initial state based on the initial state probabilities and the emission probabilities of the first observed word.
                            </li>
                            <li>
                            Iteratively calculating, for each subsequent word, the most likely tag considering both the emission probabilities of the observed word for each tag and the transition probabilities from all possible previous tags.
                            </li>
                            <li>
                            Employing algorithms like the Viterbi algorithm, which efficiently finds the most likely sequence of hidden states by using dynamic programming to keep track of the most likely path to each state at each step in the sequence.
                            </li>
                        </ol>
                        I won't get into the details of these algorithms but feel free to learn them on your own! 
                    </p>

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                        <b>Number of States:</b> The total number of hidden states in the model, often denoted as <InlineMath math="N" />. These states represent the underlying processes that result in the observed data (e.g., part-of-speech tags in NLP).
                    </li>
                    <li>
                        <b>State Transition Probabilities:</b> The probabilities of transitioning from one state to another, represented in a matrix <InlineMath math="A" /> of size <InlineMath math="N \times N" />. Each entry <InlineMath math="a_{ij}" /> denotes the probability of moving from state <InlineMath math="i" /> to state <InlineMath math="j" />.
                    </li>
                    <li>
                        <b>Emission Probabilities:</b> The probabilities of observing each possible observable given each hidden state, represented in a matrix <InlineMath math="B" /> of size <InlineMath math="N \times M" />, where <InlineMath math="M" /> is the number of possible observations. Each entry <InlineMath math="b_{ij}" /> denotes the probability of observing <InlineMath math="j" /> from state <InlineMath math="i" />.
                    </li>
                    <li>
                        <b>Initial State Probabilities:</b> The initial distribution of states, represented as a vector <InlineMath math="\pi" /> of size <InlineMath math="N" />. Each entry <InlineMath math="\pi_i" /> denotes the probability that the <InlineMath math="i^{th}" /> state is the initial state.
                    </li>
                    </ul>
    
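                    Strictly speaking, only the number of states is a hyperparameter that is chosen up front; the transition, emission, and initial state probabilities are model parameters. They are 
                    typically estimated from training data using algorithms like the Baum-Welch algorithm (a form of the Expectation-Maximization algorithm) for unsupervised 
                    learning, or set directly in supervised learning contexts where the state sequences are known.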
                
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
A Python example for a simple HMM application might involve using libraries like 'hmmlearn' 
                     for modeling and inference:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`from hmmlearn import hmm
import numpy as np

# Example: Modeling a simple weather system
states = ["Sunny", "Rainy"]
n_states = len(states)

observations = ["walk", "shop", "clean"]
n_observations = len(observations)

# Start probability
start_probability = np.array([0.6, 0.4])

# Transition probability
transition_probability = np.array([
  [0.7, 0.3],
  [0.4, 0.6]
])

# Emission probability
emission_probability = np.array([
  [0.3, 0.4, 0.3],
  [0.1, 0.3, 0.6]
])

# Create HMM (each observation row is one-hot, i.e. a single trial, so n_trials=1)
model = hmm.MultinomialHMM(n_components=n_states, n_trials=1)
model.startprob_ = start_probability
model.transmat_ = transition_probability
model.emissionprob_ = emission_probability

# Observation sequence
# Each observation is a one-hot encoded vector representing "walk", "shop", "clean"
obs_seq = np.array([
    [1, 0, 0],  # walk
    [0, 0, 1],  # clean
    [0, 1, 0],  # shop
    [0, 1, 0],  # shop
    [0, 0, 1],  # clean
    [1, 0, 0]   # walk
])

# Predict the hidden states of the given observation sequence
logprob, states_seq = model.decode(obs_seq, algorithm="viterbi")

# Map the state indices to state names
state_names = [states[state_idx] for state_idx in states_seq]

print("The states are:", ", ".join(state_names))`}
                        </SyntaxHighlighter>
                    </p>
                </section>

                
                <section id="lda" className="code-cleaned">
                <h2>Latent Dirichlet Allocation</h2>
                <p className="subsubsection-paragraph">
                    Latent Dirichlet Allocation (LDA) is a sophisticated generative statistical model designed to uncover hidden thematic structures within large collections of text data. By 
                    identifying unobserved groupings, known as topics, LDA facilitates a deeper understanding of the underlying themes that pervade a corpus, making it invaluable for tasks such as
                     document classification, information retrieval, and content summarization.
                </p>

                <p className="subsubsection-paragraph">
                <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                    <tbody>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span style={{ color: '#333399' }}>Topic modeling</span>,
                                <span style={{ color: '#008000' }}> Document classification</span>,
                                <span style={{ color: '#ff4500' }}> Content recommendation</span>,
                                <span style={{ color: '#1e90ff' }}> Information retrieval</span>
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span style={{ color: '#6a5acd' }}>Gensim (gensim.models.LdaModel)</span>,
                                <span style={{ color: '#20b2aa' }}> Scikit-learn (sklearn.decomposition. LatentDirichletAllocation)</span>
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                Dependent on the algorithm used for inference; typically <span>O(kn)</span>, where <i>k</i> is the number of topics and <i>n</i> is the number of unique words
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span>"Latent Dirichlet Allocation"</span> by David M. Blei, Andrew Y. Ng, and Michael I. Jordan, 2003
                            </td>
                        </tr>
                    </tbody>
                </table>
            </p>


                <h4>LDA Foundations</h4>
                <p className="subsubsection-paragraph">
                    At the heart of LDA lies the Dirichlet distribution, a family of continuous multivariate probability distributions characterized by a vector of positive reals. This distribution is 
                    crucial for LDA as it acts as a prior for both the distribution of topics within documents and the distribution of words within topics. This mathematical foundation supports the
                     intuitive notion that documents typically discuss a limited array of topics and that topics, in turn, are represented by a select set of words used recurrently.
                    <BlockMath math="Dir(\boldsymbol{\alpha}) = \frac{1}{B(\boldsymbol{\alpha})} \prod_{i=1}^{K} \theta_{i}^{\alpha_i - 1}" />
                    Here, <InlineMath math="\boldsymbol{\alpha}" /> denotes the parameter vector that influences the distribution's shape, <InlineMath math="\theta_{i}" /> represents the topic proportions
                     within a document, and <InlineMath math="K" /> signifies the total number of topics to be discovered.
                </p>

                <p className="subsubsection-paragraph">
                    The essence of LDA is to model each document as a mixture of various topics, where a topic is defined by a distribution over words. This model posits that documents are generated through
                     a probabilistic process that includes selecting topic distributions for each document and word distributions for each topic, guided by the Dirichlet priors.
                    <BlockMath math="P(\boldsymbol{W}, \boldsymbol{Z}, \boldsymbol{\theta}, \boldsymbol{\varphi} ; \alpha, \beta) = \prod_{j=1}^{M} P\left(\theta_{j} ; \alpha\right) \prod_{i=1}^{K} P\left(\varphi_{i} ; \beta\right) \prod_{t=1}^{N} P\left(Z_{j, t} \mid \theta_{j}\right) P\left(W_{j, t} \mid \varphi_{Z_{j, t}}\right)" />
                    In this formulation, <InlineMath math="\boldsymbol{W}" /> and <InlineMath math="\boldsymbol{Z}" /> represent the observed words and their corresponding topic assignments, respectively, 
                    while <InlineMath math="\boldsymbol{\theta}" /> and <InlineMath math="\boldsymbol{\varphi}" /> denote the topic distributions for documents and word distributions for topics. The goal
                     of LDA is to infer the optimal latent topic structure that maximizes the likelihood of the observed data, thereby revealing the most coherent and distinct topics.
                </p>

                <p className="subsubsection-paragraph">
                    Imagine a corpus consisting of articles from diverse fields such as sports, technology, and culinary arts. LDA can be applied to this corpus to discover topics that might include specific
                     themes like soccer, software development, and baking. For instance, in a soccer-related article, the model might identify a distribution where terms such as "goal", "match", and "player"
                      have high probabilities under one topic, while words like "recipe", "oven", and "cake" might dominate another topic related to baking. The power of LDA lies in its ability to automatically
                       uncover these thematic structures without any prior labeling, providing valuable insights into the predominant subjects and their proportions within each document. Feel free to look 
                       into the original paper to learn more about this. 
                </p>

                {/* <p className="subsubsection-paragraph">
                    In practice, the application of LDA extends beyond just topic identification; it can enhance information retrieval systems by improving document tagging and categorization, aid content
                     recommendation engines in understanding user preferences, and even support the summarization of large text corpora by capturing the key topics. The flexibility and effectiveness of LDA 
                     make it a cornerstone technique in the domain of natural language processing and text mining.
                </p> */}

                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">
                <ul>
                    <li>
                        <b>Number of Topics (<InlineMath math="K" />)</b>: The number of distinct topics to be extracted from the corpus. Choosing the right number of topics is crucial for capturing the
                         underlying thematic structure without overfitting or underfitting.
                    </li>
                    <li>
                        <b>Dirichlet Prior on Document-Topic Distributions (<InlineMath math="\alpha" />)</b>: A parameter that influences the distribution of topics within documents. Higher values 
                        lead to documents containing a mixture of more topics, whereas lower values encourage documents to be composed of fewer topics.
                    </li>
                    <li>
                        <b>Dirichlet Prior on Topic-Word Distributions (<InlineMath math="\beta" />)</b>: A parameter that affects the distribution of words within topics. Higher values result in topics
                         that are represented by a broader array of words, while lower values make topics more focused on a smaller set of words.
                    </li>
                    <li>
                        <b>Maximum Iterations</b>: The maximum number of iterations to run the algorithm for. More iterations allow for better convergence at the cost of increased computation time.
                    </li>
                    <li>
                        <b>Document Update Interval</b>: The number of documents to pass through before updating the model's parameters. This can affect the speed and quality of convergence.
                    </li>
                    <li>
                        <b>Topic Threshold</b>: A threshold that determines the minimum probability contribution a topic must have to be considered relevant for a document. This can help in filtering out 
                        topics with negligible presence in a document.
                    </li>
                </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    LDA can be implemented in Python using libraries like Gensim. A common application is to discover the thematic structure in a large corpus of text data. Here's an example:
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`from gensim import corpora, models
import gensim

# Sample documents
doc_a = "The cat sat on the hat"
doc_b = "The dog ate the cat and the hat"
# Compile documents
doc_set = [doc_a, doc_b]

# Tokenize documents
texts = [doc.split() for doc in doc_set]

# Create a dictionary from the tokens
dictionary = corpora.Dictionary(texts)

# Convert to bag-of-words format
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

# Print topics
for topic in ldamodel.print_topics(num_topics=2, num_words=3):
    print(topic)`}
                        </SyntaxHighlighter>
                    </p>
                </section>



                <section id="word2vec"  className="code-cleaned">
                <h2>Word2Vec & Doc2Vec</h2>
                <p className="subsubsection-paragraph">
                    Word2Vec (also known as neural word embeddings) and Doc2Vec are algorithms used to produce word embeddings, which are vector representations of words and documents. These 
                    models capture semantic relationships between words and can be used for various NLP tasks.
                </p>

                <p className="subsubsection-paragraph">
                <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                    <tbody>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span style={{ color: '#333399' }}>Semantic analysis</span>,
                                <span style={{ color: '#008000' }}> Document similarity</span>,
                                <span style={{ color: '#ff4500' }}> Language translation</span>,
                                <span style={{ color: '#1e90ff' }}> Feature generation for machine learning models</span>
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span style={{ color: '#6a5acd' }}>Gensim (gensim.models.Word2Vec, gensim.models.Doc2Vec)</span>,
                                <span style={{ color: '#20b2aa' }}> spaCy (for Word2Vec embeddings)</span>
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                Typically <span>O(v*n)</span>, where <i>v</i> is the number of words in the vocabulary and <i>n</i> is the dimensionality of the feature vectors
                            </td>
                        </tr>
                        <tr>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                            <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                <span>"Efficient Estimation of Word Representations in Vector Space"</span> by Mikolov et al., 2013 for Word2Vec; 
                                <span> "Distributed Representations of Sentences and Documents"</span> by Le and Mikolov, 2014 for Doc2Vec
                            </td>
                        </tr>
                    </tbody>
                </table>
            </p>




                <h4>Word2Vec Foundations</h4>
                <p className="subsubsection-paragraph">
                    Word2Vec is a clever utilization of neural networks. When we develop neural networks, we usually care about the output because we are generally looking to predict something.
                    However, in the case of Word2Vec, where the goal is to generate embeddings for words, we aren't concerned with the output; rather, we care about where we can 
                    extract those embeddings. In Word2Vec, instead of caring about the prediction, we look at the hidden layer of the neural network. Let's clarify this with an example!
                </p>

                <p className="subsubsection-paragraph">
                    Imagine a generic case where you are trying to find similar hashtags on Twitter to the one you input. In this case, your dictionary will be every hashtag used within your
                    dataset and each document would be a particular post. The corpus would be the collection of all posts. We can represent each hashtag in a corpus as an <InlineMath math="d" />-dimensional vector. 
                    The size of <InlineMath math="d" /> is left as a hyperparameter that corresponds to the number of neurons in the first hidden layer of a feed-forward neural network. Data is organized in pairs with 
                    one of the two values being the input to the neural network and the other being the target variable. The pairs are constructed within documents. <br />
                </p>
                
                <p className="subsubsection-paragraph">
                
                <figure className="flex-container-caption">
                <div className="flex-container"><img src={word2vec_ohe} alt="Broken" className="image-medium"/></div>
                        <figcaption>This can be the one-hot encoded input into a Word2Vec neural network.</figcaption>
                        </figure>



                The usual pre-processing is also applied including one-hot encoding to input the hashtags into the network. The (ordered) input vector will have a length equal to the number of unique words 
                in the corpus i.e. the unique hashtags across all Twitter posts, say <InlineMath math = "n" />. The target variable will be situated similarly and a softmax activation function will determine the probability 
                of relation between the input vector and the other words in the corpus.

                </p>



                <p className='subsubsection-paragraph'>
                <figure className="flex-container-caption">
                    <div className="flex-container"><img src={word2vec_pairs} alt="Broken" className="image-medium"/></div>
                        <figcaption>These are some examples of input/output pairs we could make from a given piece of text.</figcaption>
                        </figure>

                        </p>

                <p className='subsubsection-paragraph'>

                We are more interested in the single hidden layer than we are in the output of the neural network. The purpose of the output here is to train the model but we are not concerned with making 
                new predictions with the model. Instead, we use the hidden layer as the co-ordinates for a <InlineMath math="d" />-dimensional embedding space. This is possible because the input vector is a set of 0's except 
                for a 1 corresponding to the single input hashtag. This means that every neuron in the hidden layer is a transformation of a linear combination with <InlineMath math = "n + 1"/> terms where all of the terms 
                except for one are multiplied by 0 (excluding the bias). Every word in the corpus then gets a unique set of co-ordinates associated with it with length equal to <InlineMath math="d" />; words that are used often
                 together will have co-ordinates that are close together due to the training process. We can then apply cosine similarity in this space to measure how far apart two hashtags are. The closest
                  hashtags in the embedding space to the input hashtag are returned as the most similar words.<br />

                </p>

                <p className='subsubsection-paragraph'>
                    
                <figure className="flex-container-caption">
                <div className="flex-container"><img src={word2vec_arch} alt="Broken" className="image-medium"/></div>
                        <figcaption>This shows what the architecture could look like in something like the Twitter example.</figcaption>
                        </figure>
                
                </p>


                <p className='subsubsection-paragraph'>
                    Lastly, let's talk about the two types of word2vec models: CBOW & Skipgram. The CBOW model predicts a target word from a set of context words. In essence, it takes the 
                    "bag" of context words as input and tries to predict the word that is most likely to appear in the center of this context. This is akin to filling in a blank in a sentence where the
                     context is given. Mathematically, given a context window of words <InlineMath math="C = \{w_{1}, w_{2}, ..., w_{N}\}" />, CBOW aims to maximize the 
                     probability <InlineMath math="P(w_{target} | C)" />. The input layer is the average of the one-hot encoded vectors of the context words, which is then projected into the embedding space
                      by the hidden layer. The strength of CBOW lies in its efficiency. By averaging the context words, CBOW smooths over a lot of the noise and is faster to train, particularly for larger datasets.
                       This makes it well-suited for smaller and more cohesive datasets where word frequency is a strong indicator of context.

                </p>

                <p className="subsubsection-paragraph">
                    The Skip-gram model, on the other hand, flips the script. Given a target word, Skip-gram tries to predict the surrounding context words. For a target 
                    word <InlineMath math="w_{target}" />, Skip-gram maximizes the probability <InlineMath math="P(C | w_{target})" />, where <InlineMath math="C" /> is the set of context words. This 
                    model shines when dealing with larger contexts and less frequent words, as it treats each context-target pair as a new observation.
                </p>

                <p className="subsubsection-paragraph">
                    Skip-gram is inherently more complex than CBOW because it predicts multiple words for each input word. As a result, it tends to be slower to train but is excellent at capturing a wide 
                    range of semantic relationships, especially in large datasets with a diverse vocabulary. The choice between CBOW and Skip-gram should be guided by the specific requirements of your NLP task,
                     the size and nature of your dataset, and the computational resources at
                     your disposal. While CBOW is efficient and effective for common words and smaller datasets, Skip-gram excels in capturing nuanced semantic relationships across a larger vocabulary.
                </p>



                <h4>Doc2Vec Foundations</h4>
                <p className="subsubsection-paragraph">
                    Building on the foundation laid by Word2Vec, Doc2Vec (or Paragraph Vector) extends the embedding technique to entire documents or paragraphs, enabling us to capture the semantic
                     meaning of larger text structures. This model is particularly useful for tasks that require an understanding of document-level context, such as document similarity, document 
                     classification, and sentiment analysis.
                </p>

                <p className="subsubsection-paragraph">
                    Doc2Vec enhances the Word2Vec architecture by introducing a document token (also known as a paragraph ID) in addition to word vectors. This document token acts as a unique 
                    identifier for each document (or paragraph) in the dataset, allowing the model to learn a fixed-length vector representation for each document. Essentially, while Word2Vec learns
                     embeddings for words, Doc2Vec learns embeddings for both words and documents simultaneously.
                </p>

                <p className="subsubsection-paragraph">
                    There are two primary architectures of Doc2Vec: Distributed Memory (DM) and Distributed Bag of Words (DBOW). DM is analogous to the CBOW model of Word2Vec, where the model predicts a 
                    target word given its context words and the unique document ID. Conversely, DBOW is similar to the Skip-gram model, but instead of predicting context words from a target word, it 
                    predicts words randomly sampled from the document, given the document ID.
                </p>

                {/* <BlockMath math="\text{DM: } P(w_{target} | w_{context}, d) = \frac{\exp(\mathbf{v}_{w_{target}}^\top (\mathbf{h}_{w_{context}} + \mathbf{d}))}{\sum_{w=1}^{V} \exp(\mathbf{v}_w^\top (\mathbf{h}_{w_{context}} + \mathbf{d}))}" />

                <BlockMath math="\text{DBOW: } P(w_{sampled} | d) = \frac{\exp(\mathbf{v}_{w_{sampled}}^\top \mathbf{d})}{\sum_{w=1}^{V} \exp(\mathbf{v}_w^\top \mathbf{d})}" /> */}

                <p className="subsubsection-paragraph">
                    As an example, consider a corpus with documents related to technology, sports, and cooking. In the DM model, if our target word is "Apple" and it appears in a document about technology, 
                    the model uses the context words around "Apple" and the unique document ID to predict "Apple". The document ID helps the model understand that in this context, "Apple" is more likely 
                    related to technology than to cooking. As another example, in the DBOW model, for a document about soccer, the model might try to predict words like "goal", "ball", or "player" given only the document ID. This forces 
                    the model to learn document-specific embeddings that are informative enough to recall various words from the document, embedding the overall theme or topic of the document into the 
                    vector space.
                </p>

                <p className="subsubsection-paragraph">
                    The advantage of Doc2Vec lies in its ability to encapsulate the semantic essence of documents in a fixed-length vector, regardless of their length. This property makes it incredibly versatile
                     for downstream tasks. For instance, in sentiment analysis, Doc2Vec embeddings can serve as input features for a classifier to determine the sentiment expressed in movie reviews. 
                     Similarly, in document classification, the embeddings can help categorize articles into predefined topics based on their content.
                </p>

                <p className="subsubsection-paragraph">
                    Imagine a recommendation system for academic papers. By using Doc2Vec, we can embed each paper into the vector space and then find similar papers by measuring
                     the cosine similarity between their vectors. This enables the recommendation of papers that are contextually related, even if they don't share specific keywords.
                </p>

                {/* <p className="subsubsection-paragraph">
                    To sum up, Doc2Vec is a powerful extension of Word2Vec that allows for the embedding of larger text units, capturing their semantic meanings in a dense vector form. This facilitates 
                    a myriad of applications in NLP, making it an essential tool for tasks that require an understanding of document-level context.
                </p> */}


                <h4>Hyperparameters</h4>
                <p className="subsubsection-paragraph">

                    <ul>
                        <li>
                            <b>Vector Size (<InlineMath math="d" />)</b>: The dimensionality of the word or document vectors. Higher dimensions can capture more information but increase computational 
                            complexity.
                        </li>
                        <li>
                            <b>Window Size</b>: The maximum distance between a target word and words around the target word. Affects the context scope considered by the model.
                        </li>
                        <li>
                            <b>Min Count</b>: The minimum frequency count of words. Words with a frequency lower than this threshold will be ignored. This helps in removing rare words.
                        </li>
                        <li>
                            <b>Sampling Threshold</b>: The threshold for configuring which higher-frequency words are randomly downsampled. Useful for removing noise from common words.
                        </li>
                        <li>
                            <b>Negative Sampling</b>: The number of "noise words" to be drawn in negative sampling. Helps in learning the distribution of noise words.
                        </li>
                        <li>
                            <b>Epochs</b>: The number of iterations over the corpus. More epochs can lead to better model training at the cost of increased computational time.
                        </li>
                        <li>
                            <b>Learning Rate (<InlineMath math="\alpha" />)</b>: The step size at each iteration while moving toward a minimum of the loss function. Often starts high and decreases 
                            during training.
                        </li>
                    </ul>

                        <b>Specific to Doc2Vec:</b>

                    <ul>
                        <li>
                            <b>DM (Distributed Memory)</b>: Indicates whether the DM model of Doc2Vec is used. A value of 1 enables DM, while 0 switches to the DBOW model.
                        </li>
                        <li>
                            <b>DM Concatenation</b>: If using DM, this parameter determines whether to concatenate (1) or average (0) word vectors and the paragraph vector in the hidden layer. 
                            Concatenation can capture more complex patterns but increases the parameter space.
                        </li>
                        <li>
                            <b>DM Mean</b>: If using DM, this parameter decides whether to use the mean (1) or the sum (0) of the context word vectors. This choice affects the training speed and quality of the
                             embeddings.
                        </li>
                    </ul>
                </p>

                <h4>In Code</h4>
                <p className="subsubsection-paragraph">
                    In Python, Word2Vec and Doc2Vec can be implemented using libraries like Gensim. An example might involve training a Word2Vec model on a corpus and then exploring the relationships
                     between word vectors.
                    <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

# Sample text data (for illustration purposes, replace with your dataset)
texts = [
    "Word2Vec is a technique to compute vector representations of words",
    "Doc2Vec is an extension of Word2Vec to compute vector representations of documents",
    "This is a simple example of Word2Vec and Doc2Vec models"
]

# Tokenize the text data
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Word2Vec Training
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=2)
word2vec_model.train(tokenized_texts, total_examples=word2vec_model.corpus_count, epochs=10)

# Word2Vec Inference
print("Word2Vec Inference:")
word_vector = word2vec_model.wv['word2vec']  # Get vector for 'word2vec'
print(f"Vector for 'word2vec':{word_vector}")

# Prepare data for Doc2Vec
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(texts)]

# Doc2Vec Training
doc2vec_model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=2, epochs=10)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Doc2Vec Inference
print("Doc2Vec Inference:")
doc_vector = doc2vec_model.infer_vector(tokenized_texts[0])  # Infer vector for the first document
print(f"Vector for the first document:{doc_vector}")`}
        </SyntaxHighlighter>
    </p>
</section>


                    <section id="glove" className="code-cleaned">
                        <h2>GloVe</h2>
                        <p className="subsubsection-paragraph">
                            GloVe is an unsupervised learning algorithm for generating vector representations for words.
                        </p>

                        <p className="subsubsection-paragraph">
                            <table style={{ width: '100%', borderCollapse: 'collapse', margin: '10px 0' }}>
                                <tbody>
                                    <tr>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>Use Cases</td>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                            <span style={{ color: '#333399' }}>Text similarity</span>,
                                            <span style={{ color: '#008000' }}> Sentiment analysis</span>,
                                            <span style={{ color: '#ff4500' }}> Named entity recognition</span>,
                                            <span style={{ color: '#1e90ff' }}> Machine translation</span>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>Python Libraries</td>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                            <span style={{ color: '#6a5acd' }}>Gensim (gensim.models.KeyedVectors.load_word2vec_format for pre-trained GloVe vectors)</span>,
                                            <span style={{ color: '#20b2aa' }}> spaCy (provides GloVe models integrated with spaCy's language models)</span>
                                        </td>
                                    </tr>
                                    <tr>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>O-Complexity (Worst Case)</td>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                            Dependent on corpus size and vocabulary, typically <span>O(v*c)</span>, where <i>v</i> is the vocabulary size and <i>c</i> is the context window size
                                        </td>
                                    </tr>
                                    <tr>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>Relevant Papers</td>
                                        <td style={{ padding: '8px', border: '1px solid #ddd' }}>
                                            <span>"GloVe: Global Vectors for Word Representation"</span> by Pennington, Socher, and Manning, 2014
                                        </td>
                                    </tr>
                                </tbody>
                            </table>
                        </p>


                        <h4>GloVe Foundations</h4>
                        <p className="subsubsection-paragraph">
                            In word2vec, our approach was self-supervised -- this means that we considered only a particular document (e.g. post, sentence, etc.) to make the pairs (in the skipgram case) 
                            but didn't really care about the overall global relation i.e. how, across all documents, words were used together. In GloVe, we attempt to do this by considering something 
                            called a co-occurrence matrix. This matrix captures how frequently pairs of words appear together in a certain context within the entire corpus, providing a global picture of word
                             relationships. GloVe then seeks to learn word embeddings in such a way that the dot product of two word vectors captures the logarithm of their co-occurrence probability, 
                             adjusted by certain biases for each word -- this will be made clear when we look at the cost function but let's first consider how the co-occurrence matrix is constructed. 
                        </p>

                        
                        <p className="subsubsection-paragraph">
                        Let's consider a simple example with two short documents to illustrate how a co-occurrence matrix might be constructed. For this example, let's use the following two documents:
                        <ul>
                            <li><strong>Document 1: </strong> "apple banana"</li>
                            <li><strong>Document 2: </strong> "banana orange apple"</li>
                            </ul>
                            
                            

                            Assuming we use a window size of 2 (which means we consider up to two words to the left and two words to the right of each target word as its context), we can construct the 
                            co-occurrence matrix by first observing that the unique words across these documents form our vocabulary: ["apple", "banana", "orange"]. For each word in the vocabulary, we 
                            count how many times each other word occurs in its context across all documents. The co-occurrence matrix, with rows and columns representing the vocabulary words, might look
                             like this:

                             <p class='centered-table'>
                             <table>
                                    <tr>
                                        <th></th>
                                        <th>apple</th>
                                        <th>banana</th>
                                        <th>orange</th>
                                    </tr>
                                    <tr>
                                        <td>apple</td>
                                        <td>0</td>
                                        <td>2</td>
                                        <td>1</td>
                                    </tr>
                                    <tr>
                                        <td>banana</td>
                                        <td>2</td>
                                        <td>0</td>
                                        <td>1</td>
                                    </tr>
                                    <tr>
                                        <td>orange</td>
                                        <td>1</td>
                                        <td>1</td>
                                        <td>0</td>
                                    </tr>
                                    </table>
                            </p>


                             <ul>
                             <li><strong>apple: </strong>
                                Co-occurs with "banana" 2 times: once in Document 1 and once in Document 2.
                                Co-occurs with "orange" 1 time: in Document 2.</li>
                               <li> <strong>banana: </strong>
                                Co-occurs with "apple" 2 times: once in Document 1 and once in Document 2.
                                Co-occurs with "orange" 1 time: in Document 2.</li>
                                <li><strong>orange: </strong>
                                Co-occurs with "apple" 1 time: in Document 2.
                                Co-occurs with "banana" 1 time: in Document 2.</li>
                             </ul>

                             One of the key benefits of utilizing a co-occurrence matrix, as seen in models like GloVe, is the ability to discern the relative probabilities that indicate the relevance of 
                             certain words to a given target word. This approach enables us to quantify the strength of association between words based on their contextual relationships. Consider, for instance, the
                              comparative analysis of two probabilities: the likelihood of the word "solid" appearing in conjunction with "ice," versus "solid" appearing with "steam." Intuitively, we anticipate a higher
                               probability for the combination of "solid" and "ice" than for "solid" and "steam." Hence, if we were to compute the ratio 
                               of <InlineMath math="P(\text{'solid'} | \text{'ice'})" /> to <InlineMath math="P(\text{'solid'} | \text{'steam'})" />, we would expect a value exceeding 1, given
                                that <InlineMath math="P(\text{'solid'} | \text{'ice'})" /> is positioned in the numerator.

                        </p>

                        <p className="subsubsection-paragraph">
                            These probabilities can be directly derived from the co-occurrence matrix. To calculate the probability of word A appearing in the context of word B, you can just
                              identify the frequency of A's occurrence with B, divided by the total occurrences of B (summing across the corresponding row in the matrix). For example,
                              the probability of encountering "banana" in the context of "apple" is computed as <InlineMath math="\frac{2}{2 + 1} = \frac{2}{3} \approx 0.67" />, reflecting the contextual
                               affinity between these words.
                        </p>

                        <p className="subsubsection-paragraph">
                            With the foundational understanding of calculating relative probabilities from the co-occurrence matrix, the subsequent phase involves harnessing these probabilities
                             to inform the learning of word embeddings in the GloVe model. The core objective is to refine word vectors in such a manner that the geometric relationships 
                             (specifically, the dot product) between any two word vectors accurately reflect the log of their co-occurrence probabilities. In other words, you want the multiplication of the two word vectors you're looking at to be in line with what we see between them 
                              in the co-occurrence matrix and for that, we need a cost function: 

                              <BlockMath math="J = \sum_{i,j=1}^{V} f(X_{ij}) \left( w_i^T \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2" />
                            where <InlineMath math="f(X_{ij})" /> is a weighting function designed to prevent overemphasis on rare or frequent co-occurrences, 
                            and <InlineMath math="w_i" />, <InlineMath math="\tilde{w}_j" />, <InlineMath math="b_i" />, and <InlineMath math="\tilde{b}_j" /> are the word vectors and biases
                             for words <InlineMath math="i" /> and <InlineMath math="j" />, respectively, <InlineMath math="V" /> is the vocabulary size, and the summation extends over all pairs <InlineMath math="(i, j)" /> with nonzero co-occurrence 
                            counts <InlineMath math="X_{ij}" />. This optimization process iteratively updates the word vectors to reduce the discrepancy between
                              the predicted and actual log probabilities, thereby refining the embeddings.
                        </p>

                        <p className="subsubsection-paragraph">
                            The GloVe model undertakes the optimization of all word vectors and biases simultaneously across the entire vocabulary. This comprehensive approach ensures that the 
                            embeddings capture the global word-word co-occurrence statistics reflected in the co-occurrence matrix.
                        </p>

                        <p className="subsubsection-paragraph">
                            During training, the model iteratively updates the word vectors <InlineMath math="w_i" /> and context word vectors <InlineMath math="\tilde{w}_j" />, along with their
                             corresponding biases <InlineMath math="b_i" /> and <InlineMath math="\tilde{b}_j" />, for every word in the vocabulary. The objective is to minimize the cost function seen above.
                        </p>

                        <p className="subsubsection-paragraph">
                            The optimization process involves the following steps:
                            <ol>
                                <li><b>Initialization:</b> Word vectors and biases are initialized with small random values.</li>
                                <li><b>Gradient Computation:</b> For each pair <InlineMath math="(i, j)" /> with <InlineMath math="X_{ij} > 0" />, the gradient of the cost function <InlineMath math="J" /> with 
                                respect to each parameter is computed to understand how a small change in the parameter affects the cost.</li>
                                <li><b>Parameter Update:</b> The parameters are updated by moving against the gradient direction, scaled by a learning rate <InlineMath math="\eta" />. This is typically done 
                                using an optimization algorithm like stochastic gradient descent. The updates are applied across all parameters based on their gradients.</li>
                                <li><b>Iteration:</b> The above steps are repeated across multiple iterations until the cost function converges to a minimum, indicating the optimal values for the word
                                 vectors and biases.</li>
                            </ol>
                        </p>

                        <p className="subsubsection-paragraph">
                            Essentially, at each step the model takes a pair of words and calculates the current discrepancy between the dot product of their vectors (plus biases) and the actual logarithm of their co-occurrence count.
                            This discrepancy represents the error or how far off the model's current guess is from the true statistical relationship. The model then uses this discrepancy to adjust the word 
                            vectors and biases. If the dot product of two word vectors is too low compared to the logarithm of their co-occurrence count, the model adjusts the vectors to make them more 
                            similar, thereby increasing their dot product. Conversely, if the dot product is too high, the model adjusts the vectors to decrease their similarity. This process is repeated for
                             all pairs of words across the corpus, iteratively refining the word vectors and biases to reduce the overall discrepancy measured by the cost function.
                        </p>

                        <p className = "subsubsection-paragraph">
                        The ultimate goal of this update process is to reach a point where the word vectors and biases reflect the true semantic and syntactic relationships between words, as evidenced by how 
                        often they co-occur in the corpus. When the cost function is minimized, it indicates that the word vectors are well-aligned with the actual patterns of word usage in the language,
                         capturing meaningful aspects of word meaning and usage.
                        </p>


                        <h4>Hyperparameters</h4>
                        <p className="subsubsection-paragraph">
                        <ul>
                            <li>
                                <b>Vector Size (<InlineMath math="d" />)</b>: Determines the dimensionality of the word vectors. Higher dimensions can capture more nuanced semantic relationships but increase computational complexity.
                            </li>
                            <li>
                                <b>Window Size</b>: The size of the context around each word considered when constructing the co-occurrence matrix. Affects the scope of word relationships captured by the model.
                            </li>
                            <li>
                                <b>Minimum Co-occurrence Count</b>: The minimum number of times a word pair must co-occur to be included in the co-occurrence matrix. Helps to filter out rare and potentially noisy co-occurrences.
                            </li>
                            <li>
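                                <b>X<sub>max</sub> (<InlineMath math="X_{\text{max}}" />)</b>: A cutoff parameter for the weighting function <InlineMath math="f(X_{ij})" />. Co-occurrence counts below this value are weighted by <InlineMath math="(X_{ij} / X_{\text{max}})^{\alpha}" />, while counts at or above it all receive the maximum weight of 1. Capping the weight in this way prevents overly frequent word pairs from dominating the training process.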
                            </li>
                            <li>
                                <b>Alpha (<InlineMath math="\alpha" />)</b>: The exponent used in the weighting function <InlineMath math="f(X_{ij})" />. Controls how quickly the weighting function grows and levels off for higher co-occurrence counts.
                            </li>
                            <li>
                                <b>Learning Rate (<InlineMath math="\eta" />)</b>: The step size used in the optimization algorithm during training. Influences the speed and stability of convergence towards the optimal solution.
                            </li>
                            <li>
                                <b>Number of Iterations</b>: The total number of passes over the entire co-occurrence matrix during training. Affects the extent to which the model learns from the corpus, with more iterations potentially leading to better embeddings at the cost of longer training time.
                            </li>
                        </ul>
                        </p>

                        <h4>In Code</h4>
                        <p className="subsubsection-paragraph">
                            Implementing GloVe from scratch is complex, but using pre-trained GloVe vectors in Python is straightforward. Here is an example that downloads the pre-trained vectors, converts them to the
                            Word2Vec text format, and loads them with the Gensim library:
                            <SyntaxHighlighter language="python" style={docco} className="codeStyle_small">
            {`import requests
import os
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# URL of the GloVe file to download
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_file = "glove.6B.zip"
glove_file = "glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec"

# Download GloVe vectors
if not os.path.exists(glove_zip_file):
    print("Downloading GloVe vectors...")
    response = requests.get(glove_url)
    with open(glove_zip_file, "wb") as f:
        f.write(response.content)

# Unzip GloVe file (you might need to use a specific library like zipfile or tarfile based on your environment)
# For example:
# import zipfile
# with zipfile.ZipFile(glove_zip_file, "r") as zip_ref:
#     zip_ref.extractall()

# Convert the GloVe file format to Word2Vec
glove2word2vec(glove_file, word2vec_output_file)

# Load the converted vectors
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Explore the model
print(model['computer'])  # Output the vector for 'computer'
print(model.most_similar('computer'))  # Find similar words`}
                        </SyntaxHighlighter>
                    </p>
                </section>

                
                
                <div className="subsubsection-navigation">
                    <Link to="/ml/mlalgo">← Foundational ML Models</Link>
                    <Link to="/ml/cnn">Convolutional Neural Networks →</Link>
                </div>
            </main>
            
            <Footer />
        </div>
    );
}

export default ClassicML;
