我的网站

Python数据采集Selenium、PhantomJS浅谈

2021-11-22 08:08分类:猎头公司 阅读:

        一直以来我觉得用在运维的Selenium、PhantomJS是一个重器,        不到迫不得已的时候不要祭出这个大杀器,        但是涉及到JavaScript及Ajax渲染的时候,Requests就完全懵逼了!        最近回过头来重新审视这货,        这个重器用起来反倒轻巧了许多。        1.安装Selenium、PhantomJS        Selenium可以直接通过pip安装,PhantomJS则是一个exe可执行文件,需要下载解压。在使用的时候指定exe的绝对路径即可。        2.Selenium、PhantomJS基本设置from selenium import webdriverfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilitiesdcap = DesiredCapabilities.PHANTOMJSdcap[ "phantomjs.page.settings.userAgent"] = "Mozilla / 4.0(Windows NT 10.0; Win64;x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome/51.0.2704.79 Safari/ 537.36Edge/14.14393"# 请求头不一样,自适应的窗口不一样,卧槽,坑爹!driver = webdriver.PhantomJS(desired_capabilities=dcap)driver.set_page_load_timeout(10)driver.set_script_timeout(10) # 设置页面退出时间,没有必要等一个网页加载完了采集# 采集网页源码    try:        driver.get(inurl)        content = driver.page_source        # print(content)        time.sleep(1)    except:        driver.execute_script('window.stop()')driver.close()复制代码        3.Selenium、PhantomJS基本操作         如果你的网络和机子足够好,基本上就不必等待网页渲染,         否则,还需要等待,如果用time.sleep(),则有点愚蠢,#等待页面渲染完成from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as EC...try:    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))# 等某个标签元素出来,不见鸭子不撒鹰。finally:  # 撒鹰    print(driver.find_element_by_id("content").text)    driver.close()复制代码         或者用try:    elem == driver.find_element_by_tag_name("html")    # 抛出StaleElementReferenceException异常表明elem元素已经消失了,也就表明页面已经跳转了。except StaleElementReferenceException:      return复制代码      其他driver内置函数,可以通过查看源代码或者在pycharm提示获取。        4.Xpath定位Html标签1.id定位:find_element_by_id(self, id_)2.name定位:find_element_by_name(self, name)3.class定位:find_element_by_class_name(self, name)4.tag定位:find_element_by_tag_name(self, name)5.link定位:find_element_by_link_text(self, 
link_text)6.partial_link定位find_element_by_partial_link_text(self, link_text)7.xpath定位:find_element_by_xpath(self, xpath)8.css定位:find_element_by_css_selector(self, css_selector)9.id复数定位find_elements_by_id(self, id_)10.name复数定位find_elements_by_name(self, name)11.class复数定位find_elements_by_class_name(self, name)12.tag复数定位find_elements_by_tag_name(self, name)13.link复数定位find_elements_by_link_text(self, text)14.partial_link复数定位find_elements_by_partial_link_text(self, link_text)15.xpath复数定位find_elements_by_xpath(self, xpath)16.css复数定位find_elements_by_css_selector(self, css_selector)17.find_element(self, by='id', value=None)18.find_elements(self, by='id', value=None)复制代码         其中element方法定位到的是单数,是直接定位到元素;elements方法是复数,这个学过英文的都知道,定位到的是一组元素,返回的是list队列。可参照Re函数中的findall理解。          5.完整例子          这个例子属于标准化操作,在实际中可以适当简化,并结合上面的Xpath定位完成。from selenium import webdriverimport timefrom selenium.webdriver.common.desired_capabilities import DesiredCapabilities dcap = dict(DesiredCapabilities.PHANTOMJS)dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe', desired_capabilities=dcap)driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")time.sleep(3)print(driver.find_element_by_id("content").text)driver.close()#设置PHANTOMJS的USER-AGENTfrom selenium import webdriverfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilities dcap = dict(DesiredCapabilities.PHANTOMJS)dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" driver = webdriver.PhantomJS(executable_path='./phantomjs.exe', desired_capabilities=dcap)driver.get("http://dianping.com/")cap_dict = driver.desired_capabilities  #查看所有可用的desired_capabilities属性。for key in cap_dict:    
print('%s: %s' % (key, cap_dict[key]))print(driver.current_url)driver.quit()#等待页面渲染完成from selenium.webdriver.common.by import Byfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECdriver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")try:    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))finally:    print(driver.find_element_by_id("content").text)    driver.close()#处理Javascript重定向from selenium import webdriverimport timefrom selenium.webdriver.remote.webelement import WebElementfrom selenium.common.exceptions import StaleElementReferenceExceptiondef waitForLoad(driver):    elem = driver.find_element_by_tag_name("html")    count = 0    while True:        count += 1        if count > 20:            print("Timing out after 10 seconds and returning")            return        time.sleep(.5)        try:            elem == driver.find_element_by_tag_name("html")        except StaleElementReferenceException:            returndriver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")waitForLoad(driver)print(driver.page_source)######from selenium import webdriverfrom selenium.webdriver.remote.webelement import WebElementfrom selenium.webdriver import ActionChainsdriver = webdriver.PhantomJS(executable_path='phantomjs/bin/phantomjs')driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')print(driver.find_element_by_id("message").text)element = driver.find_element_by_id("draggable")target = driver.find_element_by_id("div2")actions = ActionChains(driver)actions.drag_and_drop(element, 
target).perform()print(driver.find_element_by_id("message").text)########截屏driver.get_screenshot_as_file('tmp/pythonscraping.png')#####登录知乎,然后能自动点击页面下方的“更多”,以载入更多的内容from selenium import webdriverfrom selenium.webdriver.common.keys import Keysfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver import ActionChainsimport timeimport sysdriver = webdriver.PhantomJS(executable_path='C:\Users\Gentlyguitar\Desktop\phantomjs-1.9.7-windows\phantomjs.exe')driver.get("http://www.zhihu.com/#signin")#driver.find_element_by_name('email').send_keys('your email')driver.find_element_by_xpath('//input[@name="password"]').send_keys('your password')#driver.find_element_by_xpath('//input[@name="password"]').send_keys(Keys.RETURN)time.sleep(2)driver.get_screenshot_as_file('show.png')#driver.find_element_by_xpath('//button[@class="sign-button"]').click()driver.find_element_by_xpath('//form[@class="zu-side-login-box"]').submit()try:    #等待页面加载完毕    dr=WebDriverWait(driver,5)    dr.until(lambda the_driver:the_driver.find_element_by_xpath('//a[@class="zu-top-nav-userinfo "]').is_displayed())except:    print('登录失败')    sys.exit(0)driver.get_screenshot_as_file('show.png')#user=driver.find_element_by_class_name('zu-top-nav-userinfo ')#webdriver.ActionChains(driver).move_to_element(user).perform() #移动鼠标到我的用户名loadmore=driver.find_element_by_xpath('//a[@id="zh-load-more"]')actions = ActionChains(driver)actions.move_to_element(loadmore)actions.click(loadmore)actions.perform()time.sleep(2)driver.get_screenshot_as_file('show.png')print(driver.current_url)print(driver.page_source)driver.quit()复制代码,

郑重声明:文章来源于网络,仅作为参考,如果网站中图片和文字侵犯了您的版权,请联系我们处理!

上一篇:如何相符适地把「offer」翻译成中文?

下一篇:大班美术教案:汽车设计师

相关推荐

返回顶部