from embedutils import Model
from pprint import pprint

# Build the embedding-model wrapper for the cased Italian BERT (XXL)
# checkpoint (the identifier is presumably a Hugging Face Hub id).
# NOTE(review): the meaning of the second positional argument (True) is
# defined by embedutils.Model and is not visible here — consider passing it
# by keyword once the parameter name is confirmed.
model = Model("dbmdz/bert-base-italian-xxl-cased", True)

# Report the basic sizes exposed by the model wrapper: vocabulary size,
# embedding width, and the shape of the vocabulary embedding matrix.
# NOTE(review): the recorded output shows 31102 tokens but a matrix with
# 32102 rows, so the "perciò" (therefore) in the last message does not hold
# exactly for this checkpoint — worth a remark for readers.
print(
    f"Il tokenizzatore ha un vocabolario di {model.vocab_size} token che riconosce.",
    f"Ogni token viene mappato ('embedded') in un vettore di {model.embedding_dim} numeri.",
    f"La matrice degli embeddings ha perciò dimensione {model.vocab_embeddings.shape}",
    sep="\n",
)

Il tokenizzatore ha un vocabolario di 31102 token che riconosce.
Ogni token viene mappato ('embedded') in un vettore di 768 numeri.
La matrice degli embeddings ha perciò dimensione (32102, 768)

# Look up the vocabulary id of a single token, and show the id that the
# tokenizer assigns to its unknown-token placeholder.
token = "bellezza"
token_id = model.token_to_id(token)
print(
    f"Dato un token, come '{token}', il tokenizzatore è in grado di trovarne l'identificatore: {token_id}",
    f"(ai token non presenti nel vocabolario è associato l'identificatore {model.tokenizer.convert_tokens_to_ids(model.tokenizer.unk_token)}).",
    sep="\n",
)

Dato un token, come 'bellezza', il tokenizzatore è in grado di trovarne l'identificatore: 6108
(ai token non presenti nel vocabolario è associato l'identificatore 101).

# Fetch the embedding vector for the token and show its length together with
# its first five components.
embedding = model.token_to_embedding(token)
print(
    f"Il token '{token}' è mappato in un vettore di {len(embedding)} elementi e i cui primi 5 elementi sono:\n{embedding[:5]}"
)

Il token 'bellezza' è mappato in un vettore di 768 elementi e i cui primi 5 elementi sono:
[ 0.03051873  0.01173639 -0.04997671  0.0277972   0.02349026]

# Ask the model for the 5 entries most similar to `token` and pretty-print
# them; the recorded output is a list of (token, score) pairs.
pprint(
    model.most_similar(token, top_n=5),
)

[('bellezze', 0.45),
 ('dolcezza', 0.38),
 ('splendore', 0.38),
 ('estetica', 0.35),
 ('fascino', 0.34)]

# Analogy query — target concept: a female sovereign.
# (presumably positives are added and negatives subtracted in embedding
# space — confirm against embedutils.Model.most_similar)
positive_examples, negative_examples = ["re", "donna"], ["uomo"]
print(model.most_similar(positive_examples, negative_examples))

[('regina', 0.31)]

# Analogy query — target concept: the capital of France.
positive_examples, negative_examples = ["Roma", "Francia"], ["Italia"]
print(model.most_similar(positive_examples, negative_examples))

[('Parigi', 0.48)]

# Analogy query — target concept: the country Catalonia belongs to.
positive_examples, negative_examples = ["Italia", "Catalogna"], ["Lombardia"]
print(model.most_similar(positive_examples, negative_examples))

[('Spagna', 0.39)]

# Analogy query — target concept: a French national hero.
positive_examples, negative_examples = ["Garibaldi", "Francia"], ["Italia"]
print(model.most_similar(positive_examples, negative_examples))

[('Bonaparte', 0.44)]

# Analogy query — target concept: the cold season.
positive_examples, negative_examples = ["estate", "freddo"], ["caldo"]
print(model.most_similar(positive_examples, negative_examples))

[('inverno', 0.5)]

# Analogy query — target concept: the pianist's instrument.
positive_examples, negative_examples = ["chitarra", "pianista"], ["chitarrista"]
print(model.most_similar(positive_examples, negative_examples))

[('pianoforte', 0.63)]

# Analogy query — target concept: the pianist's instrument.
# NOTE(review): this query uses exactly the same positive and negative
# examples as the cell immediately before it; it looks like an accidental
# duplicate and could be dropped or replaced with a new example.
positive_examples, negative_examples = ["chitarra", "pianista"], ["chitarrista"]
print(model.most_similar(positive_examples, negative_examples))

[('pianoforte', 0.63)]

# Analogy query — target concept: a sport practised in a gym.
positive_examples, negative_examples = ["nuoto", "palestra"], ["piscina"]
print(model.most_similar(positive_examples, negative_examples))

[('ginnastica', 0.4)]

# Analogy query — target concept: the female parent.
positive_examples, negative_examples = ["padre", "figlia"], ["figlio"]
print(model.most_similar(positive_examples, negative_examples))

[('madre', 0.56)]

# Analogy query — target concept: the opposite of "cattivo" (bad).
positive_examples, negative_examples = ["bello", "cattivo"], ["brutto"]
print(model.most_similar(positive_examples, negative_examples))

[('buono', 0.37)]

# Analogy query — target concept: the feminine form of "nero" (black).
positive_examples, negative_examples = ["bianca", "nero"], ["bianco"]
print(model.most_similar(positive_examples, negative_examples))

[('nera', 0.56)]

# Analogy query — target concept: the singular of "automobili" (cars).
positive_examples, negative_examples = ["treno", "automobili"], ["treni"]
print(model.most_similar(positive_examples, negative_examples))

[('automobile', 0.54)]

# Analogy query — target concept: the infinitive of "guardato" (watched).
positive_examples, negative_examples = ["andare", "guardato"], ["andato"]
print(model.most_similar(positive_examples, negative_examples))

[('guardare', 0.57)]

# Analogy query — target concept: the gerund of "ascoltare" (to listen).
positive_examples, negative_examples = ["pensando", "ascoltare"], ["pensare"]
print(model.most_similar(positive_examples, negative_examples))

[('ascoltando', 0.66)]

# Analogy query — target concept: the feminine form of "attore" (actor).
positive_examples, negative_examples = ["attore", "donna"], ["uomo"]
print(model.most_similar(positive_examples, negative_examples))

[('attrice', 0.63)]

Un'esplorazione del token embedding con un transformer