Python 实现百度文库自动化爬取
可以下载 doc、ppt、pdf。对于 doc 文档可以下载,doc 中的表格无法下载,图片格式的文档也可以下载。ppt 和 pdf 是先下载图片再放到 ppt 中。只要是可以预览的都可以下载。
已有功能
将可以预览的 word 文档下载为 word 文档,如果文档是扫描件,同样支持。
将可以预览的 ppt 和 pdf 下载为不可编辑的 ppt,因为网页上只有图片,所以理论上无法下载可编辑的版本。
环境安装
pip install requests
pip install my_fake_useragent
pip install python-docx
pip install opencv-python
pip install python-pptx
pip install selenium
pip install scrapy
本项目使用的是 chromedriver 控制 chrome 浏览器进行数据爬取的,chromedriver 的版本和 chrome 需要匹配。
Windows 用户看这里
1. 如果你的 chrome 浏览器版本恰好是 87.0.4280,那么恭喜你,你可以直接看使用方式了,因为我下载的 chromedriver 也是这个版本。
2. 如果不是,你需要查看自己的 chrome 浏览器版本,然后到 chromedriver 下载地址:http://npm.taobao.org/mirrors/chromedriver/ 这个地址下载对应版本的 chromedriver。比如你的浏览器版本是 87.0.4280,你就可以找到 87.0.4280.20/ 这个链接,如果你是 windows 版本,然后选择 chromedriver_win32.zip 进行下载解压。千万不要下载 LATEST_RELEASE_87.0.4280 这个链接,这个链接没有用,之前有小伙伴走过弯路的,注意一下哈。
3. 用解压好的 chromedriver.exe 替换原有文件,然后跳到使用方式。
ubuntu 用户看这里
讲道理,你已经用 ubuntu 了,那我就默认你是大神,你只要根据 chrome 的版本下载对应的 chromedriver(linux 系统的),然后把 chromedriver 的路径改成你下载解压的文件路径就好了,然后跳到使用方式。哈哈哈,我这里就偷懒不讲武德啦。
使用方式:把代码中的 url 改为你想要下载的链接地址,脚本会自动判断文档类型,在当前目录新建文件夹(./doc 或 ./ppt),并把文件下载到该文件夹中。
# Main code (主要代码)
"""Download a previewable Baidu Wenku document as a .docx or .pptx file.

The script drives Chrome (through chromedriver) in mobile-emulation mode,
expands the whole document by clicking "load more" until the page is fully
rendered, then either

* collects the page text / page images into a Word document, or
* downloads the slide images and places one image on each slide of a
  PowerPoint presentation.

Set ``url`` below to the document to download and run the file.
"""
import os
import re
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy import Selector
import requests
from my_fake_useragent import UserAgent
import docx
from docx.shared import Inches
import cv2
from pptx import Presentation
# NOTE: this rebinds ``Inches``; the pptx version is the one every call
# below actually uses (including the python-docx ``add_picture`` call).
from pptx.util import Inches

# chromedriver binary for Windows
chromedriver_path = './chromedriver.exe'
# chromedriver binary for Ubuntu
# chromedriver_path = './chromedriver'

doc_dir_path = './doc'
ppt_dir_path = './ppt'

# Sample documents, labelled with the page layout they exercise.
# url = 'https://wenku.baidu.com/view/4410199cb0717fd5370cdc2e.html?fr=search'  # doc_txt p
# url = 'https://wenku.baidu.com/view/4d18916f7c21af45b307e87101f69e314332fa36.html'  # doc_txt span
# url = 'https://wenku.baidu.com/view/dea519c7e53a580216fcfefa.html?fr=search'  # doc_txt span br
# url = 'https://wk.baidu.com/view/062edabeb6360b4c2e3f5727a5e9856a5712262d?pcf=2&bfetype=new'  # doc_img
# url = 'https://wenku.baidu.com/view/2af6de34a7e9856a561252d380eb6294dd88228d'  # VIP-only doc
# url = 'https://wenku.baidu.com/view/3de365cc6aec0975f46527d3240c844769eaa0aa.html?fr=search'  # ppt
# url = 'https://wenku.baidu.com/view/18a8bc08094e767f5acfa1c7aa00b52acec79c55'  # pdf
# url = 'https://wenku.baidu.com/view/bbe27bf21b5f312b3169a45177232f60dccce772'
# url = 'https://wenku.baidu.com/view/5cb11d096e1aff00bed5b9f3f90f76c660374c24.html?fr=search'
# url = 'https://wenku.baidu.com/view/71f9818fef06eff9aef8941ea76e58fafab045a6.html'
# url = 'https://wenku.baidu.com/view/ffc6b32a68eae009581b6bd97f1922791788be69.html'
url = 'https://wenku.baidu.com/view/d4d2e1e3122de2bd960590c69ec3d5bbfd0adaa6.html'

# Seconds to wait for an image request before giving up instead of hanging.
REQUEST_TIMEOUT_SECONDS = 30
# Upper bound on "load more" clicks so a changed page layout cannot make the
# expansion loop spin forever.
MAX_LOAD_MORE_CLICKS = 1000
# One pixel expressed in EMU (914400 EMU per inch at 72 pixels per inch).
EMU_PER_PIXEL = 12700


def _safe_filename(title, fallback='untitled'):
    """Return *title* with characters that are illegal in file names replaced."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .')
    return cleaned or fallback


def _absolute_url(img_url):
    """Turn a protocol-relative URL (``//host/path``) into an https URL."""
    if img_url.startswith('//'):
        return 'https:' + img_url
    return img_url


def _remove_files(paths):
    """Delete every path in *paths* that exists on disk."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


class DownloadImg():
    """Download single images with a randomised User-Agent header."""

    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        """Download *img_url* to *saved_path*.

        The file is only written when the server answers with status 200.

        Returns:
            ``saved_path``, whether or not the file was written; callers
            must check that the file exists before using it.
        """
        header = {
            'User-Agent': '{}'.format(self.ua.random().strip()),
            'Connection': 'close',
        }
        with requests.get(img_url, headers=header, stream=True,
                          timeout=REQUEST_TIMEOUT_SECONDS) as r:
            print('请求图片状态码 {}'.format(r.status_code))
            if r.status_code == 200:
                with open(saved_path, mode='wb') as f:
                    f.write(r.content)
                print('download {} success!'.format(saved_path))
        return saved_path


class StartChrome():
    """Drive Chrome in mobile emulation and export the opened document."""

    def __init__(self):
        """Start Chrome, open the module-level ``url`` and set up the downloader."""
        mobile_emulation = {'deviceName': 'Galaxy S5'}
        # Copy first: DesiredCapabilities.CHROME is a shared class-level dict.
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities['loggingPrefs'] = {'browser': 'ALL'}
        options = webdriver.ChromeOptions()
        options.add_experimental_option('mobileEmulation', mobile_emulation)
        # NOTE(review): these keyword arguments and find_elements_by_xpath
        # look like the Selenium 3 API - confirm the installed version.
        self.brower = webdriver.Chrome(executable_path=chromedriver_path,
                                       desired_capabilities=capabilities,
                                       chrome_options=options)
        self.brower.get(url)
        self.download_img = DownloadImg()

    def click_ele(self, click_xpath):
        """Click the first element matching *click_xpath*; do nothing if none match."""
        matches = self.brower.find_elements_by_xpath(click_xpath)
        if matches:
            # Reading this property scrolls the element into view.
            matches[0].location_once_scrolled_into_view
            # A JavaScript click still works when the element is covered.
            self.brower.execute_script('arguments[0].click()', matches[0])

    def judge_doc(self, contents):
        """Return the XPath used to pull content out of each paragraph.

        A text XPath (ending in ``text()``) is returned when the amount of
        direct text differs from the amount of ``<span>`` text; otherwise an
        image XPath (ending in ``@src``) is returned.
        """
        direct_text = ''.join(contents.xpath('./text()').extract())
        span_text = ''.join(contents.xpath('./span/text()').extract())
        if len(span_text) != len(direct_text):
            return './br/text()|./span/text()|./text()'
        return './span/img/@src'

    def create_ppt_doc(self, ppt_dir_path, doc_dir_path):
        """Expand the whole page, then export it as a pptx or a docx.

        Slide images found in the expanded page select the pptx export into
        *ppt_dir_path*; otherwise the docx export into *doc_dir_path* is used.
        """
        # Close the "become a member" dialog.
        self.click_ele("//div[@class='na-dialog-wrap show']/div/div/div[@class='btn-close']")
        # Click "continue reading".
        self.click_ele("//div[@class='foldpagewg-icon']")
        # Dismiss the "open in the Baidu app" prompt.
        self.click_ele("//div[@class='btn-wrap']/div[@class='btn-cancel']")

        xpath_loaded = "//div[@class='pagerwg-loadSucc hide']"
        xpath_button_hidden = "//div[@class='pagerwg-button' and @style='display: none;']"
        xpath_loading_more_button = "//span[@class='pagerwg-arrow-lower']"
        click_count = 0
        # Keep clicking "load more" until the page signals the last page.
        while not (self.brower.find_elements_by_xpath(xpath_loaded)
                   or self.brower.find_elements_by_xpath(xpath_button_hidden)):
            if click_count >= MAX_LOAD_MORE_CLICKS:
                print('giving up after {} load-more clicks; '
                      'exporting what has been loaded'.format(click_count))
                break
            self.click_ele(xpath_loading_more_button)
            click_count += 1
            print('第{}次点击加载更多!'.format(click_count))
            # Give the browser time to load the next chunk.
            time.sleep(1.5)

        sel = Selector(text=self.brower.page_source)
        xpath_content = (
            "//div[@class='content singlePage wk-container']/div/p/img/@data-loading-src"
            "|//div[@class='content singlePage wk-container']/div/p/img/@data-src"
        )
        if sel.xpath(xpath_content).extract():
            self.create_ppt(ppt_dir_path, sel)
        else:
            self.create_doc(doc_dir_path, sel)

    def _download_tallest_variant(self, page_index, candidate_urls,
                                  ppt_dir_path, downloaded):
        """Download every candidate URL of one slide and return the tallest image.

        Each target path is appended to *downloaded* for later cleanup.
        Returns ``None`` when no candidate could be read as an image.
        """
        best_path, best_height = None, 0
        for variant_index, img_url in enumerate(candidate_urls):
            saved_path = os.path.join(
                ppt_dir_path, '{}_{}.jpg'.format(page_index, variant_index))
            self.download_img.download_one_img(_absolute_url(img_url), saved_path)
            downloaded.append(saved_path)
            image = cv2.imread(saved_path)
            if image is None:  # download failed or the file is not an image
                continue
            if image.shape[0] > best_height:
                best_path, best_height = saved_path, image.shape[0]
        return best_path

    def create_ppt(self, ppt_dir_path, sel):
        """Build ``<title>.pptx`` in *ppt_dir_path* with one image per slide.

        All images are scaled to the size of the tallest one, and the slide
        size is set to match. Downloaded images are deleted afterwards.
        """
        os.makedirs(ppt_dir_path, exist_ok=True)
        blank_layout_index = 6  # index of the blank slide layout
        prs = Presentation()

        title = ''.join(sel.xpath("//div[@class='doc-title']/text()").extract()).strip()
        img_nodes = sel.xpath("//div[@class='content singlePage wk-container']/div/p/img")

        downloaded = []  # every file written here, so only those get deleted
        img_path_list = []  # the chosen image for each slide, in order
        for page_index, img_node in enumerate(img_nodes):
            candidate_urls = img_node.xpath(
                './@data-loading-src|./@data-src|./@src').extract()
            best_path = self._download_tallest_variant(
                page_index, candidate_urls, ppt_dir_path, downloaded)
            if best_path is not None:
                img_path_list.append(best_path)
        print(img_path_list)

        if not img_path_list:
            _remove_files(downloaded)
            print('no slide image could be downloaded, nothing saved')
            return

        # Scale every slide image to the size of the tallest one.
        target_height, target_width = max(
            (cv2.imread(path).shape[:2] for path in img_path_list),
            key=lambda shape: shape[0])
        for img_path_one in img_path_list:
            resized = cv2.resize(cv2.imread(img_path_one),
                                 (target_width, target_height))
            cv2.imwrite(img_path_one, resized)

        prs.slide_width = target_width * EMU_PER_PIXEL
        prs.slide_height = target_height * EMU_PER_PIXEL
        slide_layout = prs.slide_layouts[blank_layout_index]
        for img_path_one in img_path_list:
            slide = prs.slides.add_slide(slide_layout)
            # Explicit width/height make the picture fill the slide whatever
            # DPI metadata the image file carries.
            slide.shapes.add_picture(img_path_one, Inches(0), Inches(0),
                                     width=prs.slide_width,
                                     height=prs.slide_height)
            print('insert {} into pptx success!'.format(img_path_one))

        _remove_files(downloaded)
        pptx_path = os.path.join(ppt_dir_path, _safe_filename(title) + '.pptx')
        prs.save(pptx_path)
        print('download {} success!'.format(pptx_path))

    def create_doc(self, doc_dir_path, sel):
        """Build ``<title>.docx`` in *doc_dir_path* from page text or page images."""
        os.makedirs(doc_dir_path, exist_ok=True)

        title = ''.join(sel.xpath("//div[@class='doc-title']/text()").extract()).strip()
        document = docx.Document()
        document.add_heading(title, 0)

        contents = sel.xpath("//div[contains(@data-id,'div_class_')]//p")
        xpath_content_one = self.judge_doc(contents)

        if xpath_content_one.endswith('text()'):
            # Text document: one Word paragraph per <p>.
            for content_one in contents:
                pieces = content_one.xpath(xpath_content_one).extract()
                # A lone space gets a line break put in front of it.
                p_txt = ''.join(
                    '\n' + piece if piece == ' ' else piece for piece in pieces)
                document.add_paragraph(p_txt)
        else:
            # Scanned document: download each page image and embed it.
            img_urls = contents.xpath(xpath_content_one).extract()
            for index, img_url in enumerate(img_urls):
                saved_image_path = self.download_img.download_one_img(
                    _absolute_url(img_url),
                    os.path.join(doc_dir_path, '{}.jpg'.format(index)))
                if not os.path.exists(saved_image_path):
                    print('skip image {}: download failed'.format(index))
                    continue
                document.add_picture(saved_image_path, width=Inches(6))
                os.remove(saved_image_path)

        document.save(os.path.join(doc_dir_path,
                                   '{}.docx'.format(_safe_filename(title))))
        print('download {} success!'.format(title))


if __name__ == '__main__':
    start_chrome = StartChrome()
    try:
        start_chrome.create_ppt_doc(ppt_dir_path, doc_dir_path)
    finally:
        # Always shut the browser and chromedriver down, even on errors.
        start_chrome.brower.quit()

# Project page (项目地址):
https://github.com/siyangbing/baiduwenku
以上就是 python 实现百度文库自动化爬取的详细内容,更多关于 python 爬取百度文库的资料请关注好吧啦网其它相关文章!
相关文章:
1. ASP常用日期格式化函数 FormatDate()
2. 概述IE和SQL2k开发一个XML聊天程序
3. Ajax获取php返回json数据动态生成select下拉框的实例
4. ThinkPHP5 通过ajax插入图片并实时显示(完整代码)
5. 删除docker里建立容器的操作方法
6. asp批量添加修改删除操作示例代码
7. jsp实现登录界面
8. msxml3.dll 错误 800c0019 系统错误:-2146697191解决方法
9. CSS3实现动态翻牌效果 仿百度贴吧3D翻牌一次动画特效
10. ASP.NET MVC使用异步Action的方法
