from whatlies import EmbeddingSet
from whatlies.language import SpacyLanguage

# Load the spaCy model "en_core_web_md" through the whatlies wrapper.
lang = SpacyLanguage("en_core_web_md")

# Words whose embeddings are compared in the plot.
words = [
    "cat", "dog", "fish", "kitten",
    "man", "woman",
    "king", "queen",
    "doctor", "nurse",
]

# Look up one embedding per word and bundle them into a single EmbeddingSet.
emb = EmbeddingSet(*(lang[token] for token in words))

# Interactive plot using the "man" and "woman" embeddings as the two axes.
emb.plot_interactive(
    x_axis=emb["man"],
    y_axis=emb["woman"],
)

For documentation and more examples, see the whatlies repository: https://github.com/koaning/whatlies/

# One-time setup: download the spaCy model that SpacyLanguage("en_core_web_md")
# loads. Run this before the cell that builds `lang` if the model is missing.
!python -m spacy download en_core_web_md

import numpy as np
from whatlies.language import BytePairLanguage
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Tiny hand-labelled sentiment set: the first three sentences are labelled 1,
# the last three are labelled 0.
X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written",
]
y = np.array([1, 1, 1, 0, 0, 0])

# Two-step pipeline: byte-pair embeddings feed a logistic-regression classifier.
pipe = Pipeline(
    steps=[
        ("embed", BytePairLanguage("en")),
        ("model", LogisticRegression()),
    ]
)

# Fit on the raw sentences; the fitted pipeline is the cell's displayed value.
pipe.fit(X, y)
Pipeline(steps=[('embed', BytePairLanguage(lang='en')),
                ('model', LogisticRegression())])
# Echo the training sentences and their labels.
X, y
(['i really like this post',
  'thanks for that comment',
  'i enjoy this friendly forum',
  'this is a bad post',
  'i dislike this article',
  'this is not well written'],
 array([1, 1, 1, 0, 0, 0]))
# Show the current working directory; the relative CSV path used below
# is resolved against it.
!pwd
/home/manoj/chatgpt/searchexp
import pandas as pd

# Load the Tesco support messages and keep only rows with usable text.
# Rows with a missing `text` are dropped first: `str.contains` can yield NaN
# for them, which cannot be negated into a boolean mask, and non-string
# entries would also break the text vectorizers used further down.
df = (
    pd.read_csv("../cluestar/tesco_support.csv")
    .dropna(subset=["text"])
    # Plain substring match (no regex needed) to drop messages containing links.
    .loc[lambda d: ~d["text"].str.contains("https", regex=False)]
)

# Draw up to 2000 random messages. Capping at len(df) avoids the ValueError
# that `sample` raises when asked for more rows than are available.
texts = df["text"].sample(min(2000, len(df))).tolist()

# Peek at the first few sampled messages.
texts[:3]
['@Tesco your groceries app and mobile site refuse to allow me to log in it’s really frustrating any plans for implementing a fix?',
 '@Tesco do you have shops in France?',
 'Worst Tesco experience today. No stock of all the essentials, long lines, not enough tills open. @Tesco']
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features reduced with TruncatedSVD. n_components=2 is the library
# default, spelled out here because the result is used as 2-D plot coordinates.
pipe = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(n_components=2),
)

# Fit on the sampled messages and project them in one go.
X = pipe.fit_transform(texts)

# Display the projected coordinates.
X
array([[ 0.19568161, -0.00381794],
       [ 0.16365132,  0.19927045],
       [ 0.15350373, -0.00687524],
       ...,
       [ 0.14391981, -0.02399666],
       [ 0.19659102,  0.1060059 ],
       [ 0.16526766,  0.0234143 ]])
from cluestar import plot_text

# Hand the projected coordinates and their matching messages to cluestar's plot_text.
plot_text(X, texts)

For a fuller walkthrough of cluestar, see its overview notebook: https://github.com/koaning/cluestar/blob/main/notebooks/overview.ipynb

import umap

# Same TF-IDF features as before, now projected with UMAP (default settings)
# instead of TruncatedSVD.
pipe = make_pipeline(
    TfidfVectorizer(),
    umap.UMAP(),
)

# Re-embed the sampled messages with the new pipeline.
X = pipe.fit_transform(texts)

# Plot the UMAP projection together with the source messages.
plot_text(X, texts)