The jupyters_and_slides's discuss from computing-intelligence

【Word2Vec】训练的model没有“说”字，报KeyError: "word '说' not in vocabulary"

根据数据库训练出来的model，找不到跟说相关的词，报KeyError: "word '说' not in vocabulary"

min_count=1 已经设置为1了

path_news_txt（保存读取的news_chinese表的数据）


from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim import models
# 从config配置中读取path_news_txt（保存读取的news_chinese表的数据）, path_news_model（保存的model的路径）文件路径
from config.file_path import path_news_txt, path_news_model

if __name__ == "__main__":
    # 对读取的数据库news进行训练
    news_vec = Word2Vec(LineSentence(path_news_txt), size=100, min_count=1, workers=8)
    # 将训练结果保存为model
    news_vec.save(path_news_model)

    # 加载news_model，进行数据的测试
    model = models.Word2Vec.load(path_news_model)
    # 查找model中跟“说”相关的词
    said = model.most_similar('说')

    '''执行后报错，说训练的model中没有“说”这个词，但是数据库中有【说】字，且min_count=1了
    
    File "C:\Python36\lib\site-packages\gensim\models\keyedvectors.py", line 464, in word_vec
    raise KeyError("word '%s' not in vocabulary" % word)
    KeyError: "word '说' not in vocabulary"  
      
    '''

【性能问题】数据库读取后保存数据到txt的速度问题

我有个数据库读取保存数据的性能问题要请教下：
疑问A：

同样的代码，save_txt的代码写到get_news_from_sql的最后面，保存文本慢得要死，一行一行地读取数据
将代码分开写成函数，速度一下子提升上万倍，一下子就保存好了

疑问B：

怎么排查，调试这种问题呢？

下面快的代码和慢的代码的主要区别：

下面是写在一起，速度很快的代码

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE    ：PyCharm
@Author ：LuckyHuibo
@Date   ：2019/8/20 20:03
@Desc   ：连接数据库，读取数据——分开写，速度很快的代码
=================================================='''
import pymysql
import re
import pysnooper


def clean(s):
    """
    清洗数据
    :param s: 文本
    :return:
    """
    re_compile = re.compile(r'�|《|》|\/|）|（|【|】|\\n|\\r|\\t|\\u3000|;|\*')
    string = re_compile.sub('', str(s))
    return string


# 从数据库中得到新闻语料库
@pysnooper.snoop()
def get_news_from_sql(host, user, password, database, port):
    print('开始连接数据库...')
    db = pymysql.connect(host, user, password, database, port, charset='utf8')  # 不添加charset，读取到的数据是乱码
    print(db)
    print('连接成功...')

    cursor = db.cursor()
    sql = """SELECT content from news_chinese"""
    try:
        cursor.execute(sql)
    except Exception as e:
        # 如果发生异常，则回滚
        print("发生异常", e)
        db.rollback()
        return

    news = cursor.fetchall()
    print(len(news))
    cursor.close()
    db.close()

    return news

    # 同样的代码，save_txt的代码写到get_news_from_sql的最后面，保存文本慢得要死，一行一行地读取数据
    # 将代码分开写成函数，速度一下子提升上万倍，一下子就保存好了

def save_txt(news):
    try:
        with open('../data/news-sentences-xut2.txt', 'w', encoding='utf-8') as f:
            for content in news:
                data = content[0]
                text = clean(data)
                f.write(text + '\n')
    except Exception as w:
        print('保存数据到文本出现问题', w)


if __name__ == "__main__":
    host = "rm-8vbwj6507z6465505ro.mysql.zhangbei.rds.aliyuncs.com"
    user = "root"
    password = "AI@2019@ai"
    database = "stu_db"
    port = 3306
    try:
        contents = get_news_from_sql(host, user, password, database, port)
        save_txt(contents)
    except Exception:
        # 如果发生异常，则回滚
        print("ERROR", Exception)
        # db.rollback()
        pass

下面是写在一起，速度很慢的代码

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE    ：PyCharm
@Author ：LuckyHuibo
@Date   ：2019/8/20 20:03
@Desc   ：连接数据库，读取数据

【问题】我有个数据库读取保存数据的性能问题要请教下：
# 同样的代码，save_txt的代码写到get_news_from_sql的最后面，保存文本慢得要死，一行一行地读取数据
# 将代码分开写成函数，速度一下子提升上万倍，一下子就保存好了
=================================================='''
import pymysql
import re
import pysnooper


def clean(s):
    """
    清洗数据
    :param s: 文本
    :return:
    """
    re_compile = re.compile(r'�|《|》|\/|）|（|【|】|\\n|\\r|\\t|\\u3000|;|\*')
    string = re_compile.sub('', str(s))
    return string


# 从数据库中得到新闻语料库
@pysnooper.snoop()
def get_news_from_sql(host, user, password, database, port):
    print('开始连接数据库...')
    db = pymysql.connect(host, user, password, database, port, charset='utf8')  # 不添加charset，读取到的数据是乱码
    print(db)
    print('连接成功...')

    cursor = db.cursor()
    sql = """SELECT content from news_chinese"""
    try:
        cursor.execute(sql)
    except Exception as e:
        # 如果发生异常，则回滚
        print("发生异常", e)
        db.rollback()
        return

    news = cursor.fetchall()
    print(len(news))
    cursor.close()
    db.close()

    # return news

    # 同样的代码，save_txt的代码写到get_news_from_sql的最后面，保存文本慢得要死，一行一行地读取数据
    # 将代码分开写成函数，速度一下子提升上万倍，一下子就保存好了

    # def save_txt(news):
    try:
        with open('../data/news-sentences-xut.txt', 'w', encoding='utf-8') as f:
            for content in news:
                data = content[0]
                text = clean(data)
                f.write(text + '\n')
    except Exception as w:
        print('保存数据到文本出现问题', w)


if __name__ == "__main__":
    host = "rm-8vbwj6507z6465505ro.mysql.zhangbei.rds.aliyuncs.com"
    user = "root"
    password = "AI@2019@ai"
    database = "stu_db"
    port = 3306
    try:
        contents = get_news_from_sql(host, user, password, database, port)
        # save_txt(contents)
    except Exception:
        # 如果发生异常，则回滚
        print("ERROR", Exception)
        # db.rollback()
        pass

computing-intelligence / jupyters_and_slides Goto Github PK

jupyters_and_slides's Issues

【Word2Vec】训练的model没有“说”字，报KeyError: "word '说' not in vocabulary"

根据数据库训练出来的model，找不到跟说相关的词，报KeyError: "word '说' not in vocabulary"

min_count=1 已经设置为1了

path_news_txt（保存读取的news_chinese表的数据）

【性能问题】数据库读取后保存数据到txt的速度问题

下面是写在一起，速度很快的代码

下面是写在一起，速度很慢的代码

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent