From 2018 to 2021, I taught AI courses on NLP and text mining for learners in China.
These are source codes for the previous lessons.
我有个数据库读取保存数据的性能问题要请教下:
疑问A:
疑问B:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE :PyCharm
@Author :LuckyHuibo
@Date :2019/8/20 20:03
@Desc :连接数据库,读取数据——分开写,速度很快的代码
=================================================='''
import pymysql
import re
import pysnooper
def clean(s):
"""
清洗数据
:param s: 文本
:return:
"""
re_compile = re.compile(r'�|《|》|\/|)|(|【|】|\\n|\\r|\\t|\\u3000|;|\*')
string = re_compile.sub('', str(s))
return string
# Fetch the news corpus from the database.
@pysnooper.snoop()
def get_news_from_sql(host, user, password, database, port):
    """Read every row of news_chinese.content from MySQL.

    :param host: MySQL server host name
    :param user: database user
    :param password: database password
    :param database: schema name
    :param port: server port (int)
    :return: tuple of 1-tuples ``(content,)`` on success, ``None`` if the query fails
    """
    print('开始连接数据库...')
    # Pass connection parameters by keyword: PyMySQL 1.0 made connect()
    # keyword-only, so the original positional call breaks on current
    # versions. Without charset the fetched text comes back garbled.
    db = pymysql.connect(host=host, user=user, password=password,
                         database=database, port=port, charset='utf8')
    print(db)
    print('连接成功...')
    try:
        cursor = db.cursor()
        sql = """SELECT content from news_chinese"""
        try:
            cursor.execute(sql)
        except Exception as e:
            # A SELECT has nothing to roll back, but keep the original
            # best-effort handling: report and bail out.
            print("发生异常", e)
            db.rollback()
            return
        news = cursor.fetchall()
        print(len(news))
        cursor.close()
        return news
    finally:
        # Always release the connection — the original leaked it when the
        # query raised (early return skipped db.close()).
        db.close()
# Same code: with the save logic appended to the end of get_news_from_sql the
# write was painfully slow, row by row; moved into its own function it
# finished almost instantly (~10000x faster).
def save_txt(news):
    """Clean each fetched row and write it to the output file, one per line.

    :param news: iterable of 1-tuples ``(content,)`` as returned by fetchall()
    """
    try:
        with open('../data/news-sentences-xut2.txt', 'w', encoding='utf-8') as out:
            for row in news:
                out.write(clean(row[0]) + '\n')
    except Exception as err:
        # Best-effort: report the problem instead of crashing the caller.
        print('保存数据到文本出现问题', err)
if __name__ == "__main__":
    # SECURITY NOTE(review): real-looking credentials are hard-coded here;
    # move them to environment variables or a config file before publishing.
    host = "rm-8vbwj6507z6465505ro.mysql.zhangbei.rds.aliyuncs.com"
    user = "root"
    password = "AI@2019@ai"
    database = "stu_db"
    port = 3306
    try:
        contents = get_news_from_sql(host, user, password, database, port)
        save_txt(contents)
    except Exception as e:
        # Bug fix: the original printed the Exception *class* object
        # (`print("ERROR", Exception)`) instead of the caught instance,
        # hiding the actual error message.
        print("ERROR", e)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE :PyCharm
@Author :LuckyHuibo
@Date :2019/8/20 20:03
@Desc :连接数据库,读取数据
【问题】我有个数据库读取保存数据的性能问题要请教下:
# 同样的代码,save_txt的代码写到get_news_from_sql的最后面,保存文本慢得要死,一行一行地读取数据
# 将代码分开写成函数,速度一下子提升上万倍,一下子就保存好了
=================================================='''
import pymysql
import re
import pysnooper
def clean(s):
"""
清洗数据
:param s: 文本
:return:
"""
re_compile = re.compile(r'�|《|》|\/|)|(|【|】|\\n|\\r|\\t|\\u3000|;|\*')
string = re_compile.sub('', str(s))
return string
# Fetch the news corpus from the database and write it straight to a file.
# NOTE(review): this is the deliberately kept "slow" variant from the header
# question — the file-writing loop sits *inside* the @pysnooper.snoop()-traced
# function, so every iteration of the write loop is traced line by line.
# That tracing overhead (not the DB read) is the most plausible cause of the
# observed slowdown — confirm by removing the decorator.
@pysnooper.snoop()
def get_news_from_sql(host, user, password, database, port):
    """Read news_chinese.content from MySQL and dump cleaned rows to disk.

    :param host: MySQL server host name
    :param user: database user
    :param password: database password
    :param database: schema name
    :param port: server port
    :return: None — the fetched rows are only written to the output file here
    """
    print('开始连接数据库...')
    # Without charset the fetched text is garbled.
    db = pymysql.connect(host, user, password, database, port, charset='utf8')
    print(db)
    print('连接成功...')
    cursor = db.cursor()
    sql = """SELECT content from news_chinese"""
    try:
        cursor.execute(sql)
    except Exception as e:
        # On failure: report and roll back (nothing to undo for a SELECT),
        # then give up.
        print("发生异常", e)
        db.rollback()
        return
    news = cursor.fetchall()
    print(len(news))
    cursor.close()
    db.close()
    # return news
    # Same code: with save_txt inlined at the end of get_news_from_sql the
    # save was extremely slow (row by row); split into a separate function it
    # became ~10000x faster (see the header comment).
    # def save_txt(news):
    try:
        with open('../data/news-sentences-xut.txt', 'w', encoding='utf-8') as f:
            for content in news:
                data = content[0]
                text = clean(data)
                f.write(text + '\n')
    except Exception as w:
        print('保存数据到文本出现问题', w)
if __name__ == "__main__":
    # SECURITY NOTE(review): real-looking credentials are hard-coded here;
    # move them to environment variables or a config file before publishing.
    host = "rm-8vbwj6507z6465505ro.mysql.zhangbei.rds.aliyuncs.com"
    user = "root"
    password = "AI@2019@ai"
    database = "stu_db"
    port = 3306
    try:
        # This variant writes the file inside get_news_from_sql itself,
        # so no separate save step is needed.
        contents = get_news_from_sql(host, user, password, database, port)
        # save_txt(contents)
    except Exception as e:
        # Bug fix: the original printed the Exception *class* object
        # instead of the caught instance, hiding the actual error message.
        print("ERROR", e)
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim import models
# 从config配置中读取path_news_txt(保存读取的news_chinese表的数据), path_news_model(保存的model的路径)文件路径
from config.file_path import path_news_txt, path_news_model
if __name__ == "__main__":
    # Train Word2Vec on the dumped news text file.
    # NOTE(review): `size=` is the gensim 3.x parameter name; gensim 4+
    # renamed it to `vector_size` — confirm the installed gensim version.
    news_vec = Word2Vec(LineSentence(path_news_txt), size=100, min_count=1, workers=8)
    # Persist the trained model.
    news_vec.save(path_news_model)
    # Reload the saved model and probe it.
    model = models.Word2Vec.load(path_news_model)
    # Look up words most similar to the query word.
    # NOTE(review): `model.most_similar` is deprecated in gensim 4 — use
    # `model.wv.most_similar` there.
    said = model.most_similar('说')
# NOTE(review): the KeyError below is most likely because the dumped Chinese
# text was never word-segmented: LineSentence tokenizes on whitespace, so
# whole unsegmented sentences become vocabulary tokens and no single word
# like the query exists in the vocab even with min_count=1. Segment the text
# (e.g. with jieba) before training — verify against the dump script.
'''执行后报错,说训练的model中没有“说”这个词,但是数据库中有【说】字,且min_count=1了
File "C:\Python36\lib\site-packages\gensim\models\keyedvectors.py", line 464, in word_vec
raise KeyError("word '%s' not in vocabulary" % word)
KeyError: "word '说' not in vocabulary"
'''
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Something interesting about the web — a new door to the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents (D3) code.
China tencent open source team.