Bilibili has updated and this scraping code no longer works. How do I get at bilibili's search box now? All the inspector shows for it is `::before`.
It really can't be scraped any more: a login dialog keeps popping up. Do we have to simulate a login?
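One way around the login popup, short of scripting the whole login flow, is to reuse cookies from a browser session that is already signed in. A minimal sketch, assuming you have exported your bilibili cookies to a `cookies.json` file (the filename and the export step are assumptions, not part of the original script):

```python
import json
from selenium import webdriver

browser = webdriver.Chrome()
# Cookies can only be set for the domain currently loaded,
# so open the site once before injecting them.
browser.get('https://www.bilibili.com/')

# cookies.json is a hypothetical file exported from a logged-in
# browser session (e.g. via a cookie-export extension).
with open('cookies.json', encoding='utf-8') as f:
    for cookie in json.load(f):
        # Selenium only needs name/value; extra keys such as 'sameSite'
        # can make add_cookie raise, so keep the dict minimal.
        browser.add_cookie({'name': cookie['name'], 'value': cookie['value']})

browser.refresh()  # reload; the login dialog should no longer appear
```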
```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt

browser = webdriver.Chrome()
browser.get("https://www.bilibili.com/")
WAIT = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)

# Workbook columns: name, URL, description, views, danmaku count, upload date.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)
sheet.write(0, 0, '名称')
sheet.write(0, 1, '地址')
sheet.write(0, 2, '描述')
sheet.write(0, 3, '观看次数')
sheet.write(0, 4, '弹幕数')
sheet.write(0, 5, '发布时间')
n = 1  # next free row in the sheet

def search():
    try:
        print('start visiting bilibili...')
        browser.get("https://www.bilibili.com/")
        search_input = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#nav_searchform > input")))
        submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div/div[1]/div[1]/div/div[2]/div/form/div/button')))
        search_input.send_keys('蔡徐坤 篮球')
        submit.click()
        print('jumping to the results window')
        all_h = browser.window_handles
        browser.switch_to.window(all_h[1])
        get_source()
        # The total page count is read off the "last page" button.
        total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button")))
        return int(total.text)
    except TimeoutException:
        return search()

def next_page(page_num):
    try:
        print('fetching the next page')
        next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
        next_btn.click()
        # Wait until the pager marks page_num as the active page.
        WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button'), str(page_num)))
        get_source()
    except TimeoutException:
        return next_page(page_num)

def save_to_excel(soup):
    global n
    items = soup.find(class_='video-list clearfix').find_all(class_='video-item matrix')
    for item in items:
        item_title = item.find('a').get('title')
        item_link = item.find('a').get('href')
        item_dec = item.find(class_='des hide').text
        item_view = item.find(class_='so-icon watch-num').text
        item_biubiu = item.find(class_='so-icon hide').text
        item_date = item.find(class_='so-icon time').text
        print('scraping: ' + item_title)
        sheet.write(n, 0, item_title)
        sheet.write(n, 1, item_link)
        sheet.write(n, 2, item_dec)
        sheet.write(n, 3, item_view)
        sheet.write(n, 4, item_biubiu)
        sheet.write(n, 5, item_date)
        n = n + 1

def get_source():
    WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.filter-wrap')))
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)

def main():
    try:
        total = search()
        print(total)
        for i in range(2, total + 1):
            next_page(i)
    finally:
        browser.quit()  # quit() closes every window, not just the current one

if __name__ == '__main__':
    main()
    book.save('蔡徐坤篮球.xls')  # xlwt writes the legacy .xls format, not .xlsx
```
Use this one of mine.
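One caveat with the script above: xlwt only writes the legacy .xls format, so saving the workbook under an .xlsx name produces a file Excel will reject. If you actually want .xlsx, openpyxl is one option; a minimal sketch with the same sheet layout (`save_row` is a hypothetical helper, not part of the original):

```python
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = '蔡徐坤篮球'
# Same header row as the xlwt version.
ws.append(['名称', '地址', '描述', '观看次数', '弹幕数', '发布时间'])

def save_row(title, link, desc, views, danmaku, date):
    # append() always writes to the next empty row, so no manual
    # row counter like the global n above is needed.
    ws.append([title, link, desc, views, danmaku, date])

# ...after the crawl finishes:
wb.save('蔡徐坤篮球.xlsx')
```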
I ran into the same problem: the script just keeps reloading the homepage over and over. Any idea how to fix it?
After locating the search input you need to call .click() on it first, and only then .send_keys():
```python
def search():
    try:
        print('start visit bilibili...')
        browser.get('https://www.bilibili.com/')
        search_input = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#nav-searchform > div.nav-search-content > input")))
        search_input.click()  # focus the box first...
        search_input.send_keys('蔡徐坤篮球')  # ...then type the keyword
        search_submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="nav-searchform"]/div[2]')))
        search_submit.click()
        print('jump to new window')
        all_h = browser.window_handles
        browser.switch_to.window(all_h[1])
    except TimeoutException:
        return search()
```
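Another way to sidestep the fragile search-box selectors (and the login popup on the homepage) is to skip the homepage entirely and open the results page by URL; bilibili's search lives at search.bilibili.com. A sketch, assuming that URL format still holds:

```python
from urllib.parse import quote

keyword = '蔡徐坤 篮球'
# Go straight to the results page instead of driving the homepage UI;
# this avoids both the search-box selectors and the extra window handle.
browser.get('https://search.bilibili.com/all?keyword=' + quote(keyword))
```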