应用示例

1.通过selenium爬取"https://stocksnap.io/"上的图片。

简要说明,通过selenium打开firefox浏览器,导航到指定网址;然后定位到图片源地址,并保存;然后调用urllib模块,下载图片到指定位置。

代码如下:

    #!/bin/bash/env python
    # -*- coding: utf-8 -*-
    import time
    import urllib
    import urllib2
    import re
    import os
    from selenium import webdriver

    # 爬取页面地址  
    url = "https://stocksnap.io/"

    class GetpicHtml(object):

        def __init__(self):
            self.driver =  webdriver.Firefox()

        def gethtml(self):
            self.driver.maximize_window()
            img_url_dic = {}
            self.driver.get(url)

        # 模拟滚动窗口以浏览下载更多图片  
            pos = 0
            m = 0 # 图片编号  
            for i in range(10):
                pos += i*500 # 每次下滚500  
                js = "document.documentElement.scrollTop=%d" % pos
                self.driver.execute_script(js)
                time.sleep(1)
                elemlist= self.driver.find_elements_by_xpath("//*[@id='main']/a[*]/img")
                for elem in elemlist:
                    img_url = elem.get_attribute('src')
                    if img_url != None and not img_url_dic.has_key(img_url):                
                        img_url_dic[m] = img_url
                        m += 1
            self.driver.close()
            return img_url_dic

    class DownloadFile(object):

        def __init__(self, url_list, local_dir):
            self.url_list = url_list
            self.local_dir = local_dir

        def downloadfile(self,localname, url):
            try:
                response = urllib2.urlopen(url)
                urllib.urlretrieve(url, localname)
            except:
                print("%s Download error:" %localname)
                exit(1)

        def control(self):
            for k, v in self.url_list.iteritems():
                filename = re.findall(r"https://d2lm6fxwu08ot6.cloudfront.net/img-thumbs/280h/(.+?).jpg", v)[0] + ".jpg"
                localname = os.path.join(self.local_dir, filename)
                self.downloadfile(localname, v)


    if __name__ == '__main__':
        getsrc  = GetpicHtml()
        urllist = getsrc.gethtml()
        downloadpic = DownloadFile(urllist, "/home/isoft_lp/tmp")

代码说明:

driver.execute_script(js)

调用execute_scripts执行JavaScript脚本,随后会重点说明该方法;

elemlist= self.driver.find_elements_by_xpath("//*[@id='main']/a[*]/img")

通过xpath获取图片元素;

元素内容”

"<img src="https://d2lm6fxwu08ot6.cloudfront.net/img-thumbs/280h/9XCA8GIDBS.jpg" height="280" width="420">";

img_url = elem.get_attribute('src')

获取图片源地址;

filename = re.findall(r"https://d2lm6fxwu08ot6.cloudfront.net/img-thumbs/280h/(.+?).jpg", v)[0] + ".jpg"

获取图片文件的原名字;

response = urllib2.urlopen(url)
urllib.urlretrieve(url, localname)

指定url和本地地址,进行文件的下载

2.自动登陆https://github.com/

简要说明,通过selenium打开firefox浏览器,导航到指定网站,点击Sign in,自动输入账号密码,实现自动登陆。

代码如下:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash/env python
# -*- coding: utf-8 -*-
import time
import os
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# 登陆网址
url = "https://github.com/"

# 账号密码

loginname = "xxxx@xx"
passwordvalue = "xxxxx"

class AutoLoginGithub(object):

    def __init__(self):
        self.driver =  webdriver.Firefox()

    # 导航到github登陆页面
    def getloginpage(self):
        self.driver.get(url)
        # 定位signin按钮
        signin = self.driver.find_element_by_xpath('/html/body/header/div/div/div/a[2]')
        # 点击signin按钮,进入登陆页面
        actions = ActionChains(self.driver)
        # 移动光标至sigin按钮 
        actions.move_to_element(signin)
        actions.click(signin)
        actions.perform()
        return self.driver

    def autologin(self):
       # 获取新的页面对象
        driver = self.getloginpage()
        # 定位账号输入框
        login = driver.find_element_by_id("login_field")
        # 输入账号信息
        login.send_keys(loginname)
        # 定位密码输入框
        password = driver.find_element_by_id("password")
        # 输入密码信息
        password.send_keys(passwordvalue)
        # 定位登陆按钮
        dologin = driver.find_element_by_xpath("//*[@id='login']/form/div[4]/input[3]")
        # 点击登陆按钮
        actions = ActionChains(driver)
        actions.move_to_element(dologin)
        actions.click(dologin)
        actions.perform()
        time.sleep(3)
        driver.close()

if __name__ == '__main__':
    autologin = AutoLoginGithub()
    autologin.autologin()

代码说明:

以下代码,实现按钮的点击

actions = ActionChains(self.driver)
# 移动光标至sigin按钮 
actions.move_to_element(signin)
actions.click(signin)
actions.perform()

未完待续

上一篇Selenium主题10 下一篇selenium-Sample(二)

by 李鹏