import re
import time
import json
# Base URL of the Maoyan board; callers append the numeric page offset.
maoyan_url_base = 'https://maoyan.com/board/4?offset='

# Lazy filler that is not allowed to run past the closing tag of the current
# <dd> item.  A plain ``.*?`` under re.S happily crosses ``</dd>``: when one
# item lacks a field (for example it has no score), the match that started in
# that item borrows the missing field from the NEXT item, so the next item is
# consumed and never shows up in the results.
_SAME_ITEM = r'(?:(?!</dd>).)*?'

# Capture groups, in order: rank, title, star line, release-time line,
# integer part of the score, fractional part of the score.
pattern = re.compile(
    r'<dd>' + _SAME_ITEM
    + r'<i class="board-index' + _SAME_ITEM + r'>(' + _SAME_ITEM + r')</i>' + _SAME_ITEM
    + r'title="(' + _SAME_ITEM + r')"' + _SAME_ITEM
    + r'star">(' + _SAME_ITEM + r')</p>' + _SAME_ITEM
    + r'releasetime">(' + _SAME_ITEM + r')</p>' + _SAME_ITEM
    + r'integer">(' + _SAME_ITEM + r')</i>' + _SAME_ITEM
    + r'fraction">(' + _SAME_ITEM + r')</i>',
    re.S,
)

# Local proxy used for both plain and TLS requests.
proxies = {
    'http': 'http://127.0.0.1:10809',
    'https': 'http://127.0.0.1:10809',
}
def get_one_page_url(url, timeout=10):
    """Fetch the HTML source of one page through the configured proxy.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a stalled proxy or server would block this call forever.

    Returns:
        The response body as text when the status code is 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: Connection errors and timeouts
            raised by ``requests.get`` are not caught here.
    """
    # Imported locally because the module's import block does not import
    # requests, which would otherwise make the call below a NameError.
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
def store_html(html_txt, filename):
    """Save the source of one page to ``filename`` as UTF-8, overwriting it.

    Args:
        html_txt: Page source to store; must be a string.
        filename: Destination path.  Missing parent directories are created.

    Raises:
        TypeError: If ``html_txt`` is not a string.  The check happens before
            the file is opened, so an existing file is not truncated.
    """
    import os  # local import: the module's import block does not include os

    # Opening with 'w' truncates immediately, so validate first; otherwise a
    # failed download (None) would wipe a previously saved page before the
    # write itself raised.
    if not isinstance(html_txt, str):
        raise TypeError(
            f"html_txt must be str, not {type(html_txt).__name__}"
        )

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html_txt)
def get_store_html(filename):
    """Return the whole content of ``filename``, decoded as UTF-8."""
    with open(filename, mode='r', encoding='utf-8') as saved_page:
        return saved_page.read()
def store_10_html(page_count=10):
    """Download the board pages and save each one under ``maoyan/``.

    Page ``i`` (counting from 0) is fetched from offset ``i * 10`` and written
    to ``maoyan/maoyan_page{i}.txt``, the same names ``read_10_txts`` reads.

    Args:
        page_count: Number of consecutive pages to fetch, starting at page 0.
    """
    for i in range(page_count):
        url = maoyan_url_base + str(i * 10)
        filename = f"maoyan/maoyan_page{i}.txt"
        html = get_one_page_url(url)
        if html is None:
            # get_one_page_url returns None for any non-200 response; there is
            # nothing to save, and writing None would raise.
            print(f"skipped {url}: no page source returned")
        else:
            store_html(html, filename)
        # Pause between requests so the pages are not fetched back to back.
        time.sleep(1)
def scrap_web(filename):
    """Parse one saved board page, print each entry and return the entries.

    Args:
        filename: Path of a page previously saved to disk.

    Returns:
        A list with one dict per matched item, using the keys ``index``,
        ``title``, ``actor``, ``time`` and ``score``.  The list is empty when
        nothing on the page matches ``pattern``.
    """
    html = get_store_html(filename)
    movies = []
    for result in re.findall(pattern, html):
        rank, title, star, release, integer, fraction = result
        # The score is split over two tags in the markup: "9." and "4".
        score = (integer + fraction).strip()
        print(rank, title, star.strip(), release, score)
        movies.append({
            'index': rank,
            'title': title,
            # Drops the 3-character label in front of the cast list
            # (assumes it is always present, as in the sample markup).
            'actor': star.strip()[3:],
            # Drops the 5-character label in front of the release date
            # (same assumption).
            'time': release[5:],
            'score': score,
        })
    return movies
# def scrap_10_webs():
# """获取十个网站的源码"""
# for i in range(10):
# filename = f"maoyan/maoyan_page{i}.txt"
# content = scrap_web(filename)
# scrap_web(filename)
def write_to_json(content, filename='result.txt'):
    """Append ``content`` to a text file as one JSON document per line.

    Args:
        content: Any JSON-serialisable object.
        filename: File to append to; defaults to ``result.txt`` in the current
            working directory.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def read_10_txts():
    """Parse the ten saved pages and append every parsed item to the output.

    Reads ``maoyan/maoyan_page0.txt`` through ``maoyan/maoyan_page9.txt`` and
    hands each item obtained from ``scrap_web`` to ``write_to_json``.  Expects
    ``scrap_web`` to return an iterable of items.
    """
    filenames = [f"maoyan/maoyan_page{i}.txt" for i in range(10)]
    for filename in filenames:
        for item in scrap_web(filename):
            write_to_json(item)
if __name__ == "__main__":
    # Script entry point: parse a single previously saved page.
    saved_page = 'maoyan/maoyan_page2.txt'
    scrap_web(saved_page)
``
网页源代码里的第25条和其他条目的格式完全一样,但就是爬不出来。有没有遇到过同样问题的大佬呢?
网页源代码
` <dd>
<i class="board-index board-index-25">25</i>
<a href="/films/267" title="泰坦尼克号" class="image-link" data-act="boarditem-click" data-val="{movieId:267}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
<img data-src="https://p0.meituan.net/moviemachine/e7dd6b1f77fba08c1f20a3b20b156621642576.jpg@160w_220h_1e_1c" alt="泰坦尼克号" class="board-img" />
</a>
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/267" title="泰坦尼克号" data-act="boarditem-click" data-val="{movieId:267}">泰坦尼克号</a></p>
<p class="star">
主演:莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩
</p>
<p class="releasetime">上映时间:1998-04-03</p> </div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">4</i></p>
</div>
</div>
</div>
</dd>
<dd>
<i class="board-index board-index-26">26</i>
<a href="/films/899" title="当幸福来敲门" class="image-link" data-act="boarditem-click" data-val="{movieId:899}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
<img data-src="https://p0.meituan.net/moviemachine/e5daa8748733820faab91102bd0bc4507730353.jpg@160w_220h_1e_1c" alt="当幸福来敲门" class="board-img" />
</a>
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/899" title="当幸福来敲门" data-act="boarditem-click" data-val="{movieId:899}">当幸福来敲门</a></p>
<p class="star">
主演:威尔·史密斯,贾登·史密斯,坦迪·牛顿
</p>
<p class="releasetime">上映时间:2008-01-17</p> </div>
<div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">3</i></p>
</div>
</div>
</div>
</dd>`