MaoYan icon indicating copy to clipboard operation
MaoYan copied to clipboard

为何只有排名25的泰坦尼克号爬不出来呢

Open BroMachee opened this issue 4 years ago • 0 comments

我是先把源代码放在文件夹里,然后从本地爬取,唯独第25的泰坦尼克号没有爬出来,是为什么

import re
import time
import json
maoyan_url_base = 'https://maoyan.com/board/4?offset='
pattern = re.compile('<dd>.*?<i class="board-index.*?>(.*?)</i>.*?title="(.*?)".*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',re.S)
proxies = {
	'http':'http://127.0.0.1:10809',
	'https':'http://127.0.0.1:10809'
}
def get_one_page_url(url):
	"""获得一个网页的源码,使用代理"""
	headers={
		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36' 
	}
	result = requests.get(url,headers=headers,proxies=proxies)
	if result.status_code == 200:
		return result.text
	else:
		return None
def store_html(html_txt,filename):
	"""存储一个网页的源码"""
	with open(filename,'w',encoding='utf-8') as f:
		f.write(html_txt)
def get_store_html(filename):
	"""获取文件的内容"""
	with open(filename,'r',encoding='utf-8') as f:
		html = f.read()
	return html
def store_10_html():
	"""存储十个网页的源码"""
	for i in range(1,2):
		url = maoyan_url_base + str(i*10)
		filename = f"maoyan/maoyan_page{i}.txt"
		html = get_one_page_url(url)
		store_html(html,filename)
		time.sleep(1)


def scrap_web(filename):
	"""处理源代码的排名,电影名等,返回字典格式"""
	html = get_store_html(filename)
	results = re.findall(pattern,html)
	# 1为排名,2为电影名,3为主演名
	for result in results:
		# print(result.group(),result.group(2),result.group(3).strip(),result.group(4))
		score =result[4]+result[5]
		score = score.strip()
		print(result[0],result[1],result[2].strip(),result[3],score)
		# yield{
		# 	'index':result[0],
		# 	'title':result[1],
		# 	'actor':result[2].strip()[3:],
		# 	'time':result[3][5:],
		# 	'score':score
		# }

# def scrap_10_webs():
# 	"""获取十个网站的源码"""
# 	for i in range(10):
# 		filename = f"maoyan/maoyan_page{i}.txt"
# 		content = scrap_web(filename)
# 		scrap_web(filename)
def write_to_json(content):
	with open('result.txt','a',encoding='utf-8') as f:
		f.write(json.dumps(content,ensure_ascii=False)+'\n')
def read_10_txts():
	for i in range(10):
		filename = f"maoyan/maoyan_page{i}.txt"
		for item in scrap_web(filename):
			write_to_json(item)

if __name__ == "__main__":
	scrap_web('maoyan/maoyan_page2.txt')

``
原代码里的第25条是和别的代码一样的格式,就是爬不出来 有没有遇到同样问题的大佬呢?
网页源代码
` <dd>
                        <i class="board-index board-index-25">25</i>
    <a href="/films/267" title="泰坦尼克号" class="image-link" data-act="boarditem-click" data-val="{movieId:267}">
      <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
      <img data-src="https://p0.meituan.net/moviemachine/e7dd6b1f77fba08c1f20a3b20b156621642576.jpg@160w_220h_1e_1c" alt="泰坦尼克号" class="board-img" />
    </a>
    <div class="board-item-main">
      <div class="board-item-content">
              <div class="movie-item-info">
        <p class="name"><a href="/films/267" title="泰坦尼克号" data-act="boarditem-click" data-val="{movieId:267}">泰坦尼克号</a></p>
        <p class="star">
                主演:莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩
        </p>
<p class="releasetime">上映时间:1998-04-03</p>    </div>
    <div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">4</i></p>        
    </div>

      </div>
    </div>

                </dd>
                <dd>
                        <i class="board-index board-index-26">26</i>
    <a href="/films/899" title="当幸福来敲门" class="image-link" data-act="boarditem-click" data-val="{movieId:899}">
      <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
      <img data-src="https://p0.meituan.net/moviemachine/e5daa8748733820faab91102bd0bc4507730353.jpg@160w_220h_1e_1c" alt="当幸福来敲门" class="board-img" />
    </a>
    <div class="board-item-main">
      <div class="board-item-content">
              <div class="movie-item-info">
        <p class="name"><a href="/films/899" title="当幸福来敲门" data-act="boarditem-click" data-val="{movieId:899}">当幸福来敲门</a></p>
        <p class="star">
                主演:威尔·史密斯,贾登·史密斯,坦迪·牛顿
        </p>
<p class="releasetime">上映时间:2008-01-17</p>    </div>
    <div class="movie-item-number score-num">
<p class="score"><i class="integer">9.</i><i class="fraction">3</i></p>        
    </div>

      </div>
    </div>

                </dd>`

BroMachee avatar May 06 '21 07:05 BroMachee