注意:此代码是用 Python 3.6.1(+ Gensim 2.3.0)编写的。
Doc2Vec 与 Gensim 的 Python 实现及应用
import os
import re
from multiprocessing import Pool

import numpy as np
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import gutenberg
from scipy import spatial
# Load the tokenized sentences of Hamlet from the NLTK Gutenberg corpus and
# materialize them as a list so they can be indexed and replaced in place.
# NOTE: the corpus data must already be available locally
# (e.g. via nltk.download('gutenberg')).
sentences = list(gutenberg.sents('shakespeare-hamlet.txt'))
print('type of corpus: ', type(sentences))
print('length of corpus: ', len(sentences))
type of corpus:  <class 'list'>
length of corpus:  3106
# Peek at a few raw sentences before any preprocessing.
print(sentences[0])  # title, author, and year
print(sentences[1])
print(sentences[10])
['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']
['Actus', 'Primus', '.']
['Fran', '.']
预处理数据
# Lowercase every token and keep only tokens that start with an ASCII letter,
# which drops punctuation and numbers such as '[' or '1599'.
# The character class must cover both cases ([a-zA-Z]): the filter runs on the
# raw token, so a lowercase-only class would discard capitalized words.
starts_with_letter = re.compile(r'^[a-zA-Z]+')
for i, sentence in enumerate(sentences):
    sentences[i] = [
        word.lower() for word in sentence if starts_with_letter.match(word)
    ]
print(sentences[0])  # title, author, and year
print(sentences[1])
print(sentences[10])
['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']
# Convert each token list into a TaggedDocument, tagging it with a unique
# identifier ('sent0', 'sent1', ...) derived from its position in the corpus.
for i, words in enumerate(sentences):
    sentences[i] = TaggedDocument(words=words, tags=['sent{}'.format(i)])
# Notebook-style display of the first tagged sentence.
sentences[0]
TaggedDocument(words=['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare'], tags=['sent0'])
# Train a Doc2Vec model on the tagged sentences.
# Keyword names (`size`, `iter`) follow the gensim 2.3.0 API this article
# targets; newer gensim releases rename them (vector_size / epochs).
model = Doc2Vec(
    documents=sentences,
    dm=1,           # distributed-memory (PV-DM) training algorithm
    size=100,       # dimensionality of the learned vectors
    min_count=1,    # keep every word, even those seen only once
    iter=10,        # number of passes over the corpus
    # One worker per CPU core. Reading the count directly avoids spawning a
    # multiprocessing Pool (and leaking its worker processes) just to learn
    # how many processes it would start.
    workers=os.cpu_count() or 1,
)
# Normalize the vectors in place; this saves memory but the model should not
# be trained further afterwards.
model.init_sims(replace=True)
# Persist the trained model to disk, then reload it to show the round trip.
model.save('doc2vec_model')
model = Doc2Vec.load('doc2vec_model')
# Notebook-style display of the words whose vectors are closest to 'hamlet'.
model.most_similar('hamlet')
[('horatio', 0.9978846311569214),
('女王', 0.9971947073936462),
('莱尔特斯', 0.9971820116043091),
('国王', 0.9968599081039429),
('妈妈', 0.9966716170310974),
('哪里', 0.9966292381286621),
('迪尔', 0.9965540170669556),
('奥菲莉亚', 0.9964221715927124),
('非常', 0.9963752627372742),
('哦', 0.9963476657867432)]
# Look up the learned vectors for two words.
v1 = model['king']
v2 = model['queen']


def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        One minus the SciPy cosine distance of the two vectors, i.e. 1.0 for
        vectors pointing in the same direction and 0.0 for orthogonal ones.
    """
    return 1 - spatial.distance.cosine(v1, v2)


# Notebook-style display of the similarity between 'king' and 'queen'.
cosine_similarity(v1, v2)
0.99437165260314941
立即学习“Python免费学习笔记(深入)”;
以上就是使用 Python 进行词嵌入:Doc2Vec 的详细内容,更多请关注中国大学网其它相关文章!