应用示例
1.通过selenium爬取"https://stocksnap.io/"上的图片。
简要说明,通过selenium打开firefox浏览器,导航到指定网址;然后定位到图片源地址,并保存;然后调用urllib模块,下载图片到指定位置。
代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import urllib
import urllib2
import re
import os
from selenium import webdriver
# Address of the page whose images are scraped
url = "https://stocksnap.io/"
class GetpicHtml(object):
    """Collect image source URLs from a page rendered in Firefox.

    The page is opened through Selenium and scrolled in several rounds; after
    each round the ``src`` attribute of every matching ``<img>`` element is
    recorded.
    """

    def __init__(self):
        # Creating the object starts a Firefox session right away.
        self.driver = webdriver.Firefox()

    def gethtml(self, page_url=None, scroll_steps=10, pause=1):
        """Open the page, scroll through it and return the image URLs found.

        Args:
            page_url: Address to open. Defaults to the module-level ``url``.
            scroll_steps: Number of scroll-and-collect rounds.
            pause: Seconds to sleep after each scroll before the elements
                are read.

        Returns:
            A dict mapping a running index (0, 1, 2, ...) to each distinct
            image URL, in order of first appearance.

        The browser window is closed before returning, including when one of
        the Selenium calls raises.
        """
        if page_url is None:
            page_url = url

        img_url_dic = {}
        # URLs already stored. Membership must be tested against the URLs
        # themselves; the dict is keyed by index, so a key lookup with a URL
        # would never match and every round would re-add the same images.
        seen_urls = set()

        try:
            self.driver.maximize_window()
            self.driver.get(page_url)

            # Simulate scrolling so that more images can be collected.
            pos = 0
            for i in range(scroll_steps):
                # The offset grows by i * 500 each round (0, 500, 1500,
                # 3000, ...): the first round does not scroll and later
                # rounds jump further.
                pos += i * 500
                self.driver.execute_script(
                    "document.documentElement.scrollTop=%d" % pos)
                time.sleep(pause)

                elemlist = self.driver.find_elements_by_xpath(
                    "//*[@id='main']/a[*]/img")
                for elem in elemlist:
                    img_url = elem.get_attribute('src')
                    if img_url is None or img_url in seen_urls:
                        continue
                    seen_urls.add(img_url)
                    # The next free index doubles as the image number.
                    img_url_dic[len(img_url_dic)] = img_url
        finally:
            self.driver.close()

        return img_url_dic
class DownloadFile(object):
    """Download a collection of image URLs into a local directory."""

    def __init__(self, url_list, local_dir):
        """Store the download job.

        Args:
            url_list: dict whose values are the image URLs to fetch. The keys
                are only used to build a fallback filename.
            local_dir: Directory the files are written to.
        """
        self.url_list = url_list
        self.local_dir = local_dir

    def downloadfile(self, localname, url):
        """Fetch ``url`` and save it as ``localname``.

        The parent directory of ``localname`` is created when it is missing.
        On an I/O failure a message is printed and the program exits with
        status 1.
        """
        try:
            target_dir = os.path.dirname(localname)
            if target_dir and not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            # One request is enough: urlretrieve both fetches and saves.
            urllib.urlretrieve(url, localname)
        except (IOError, OSError) as err:
            print("%s Download error: %s" % (localname, err))
            raise SystemExit(1)

    def control(self):
        """Download every URL in ``url_list`` into ``local_dir``."""
        for key, image_url in self.url_list.items():
            # Fall back to an index-based name when the URL has no basename.
            filename = self._filename_from_url(image_url) or "image_%s" % key
            localname = os.path.join(self.local_dir, filename)
            self.downloadfile(localname, image_url)

    @staticmethod
    def _filename_from_url(image_url):
        """Return the last path segment of ``image_url`` (may be empty).

        Any query string or fragment is dropped first, so the result does not
        depend on a particular host or directory layout.
        """
        path = image_url.split("#", 1)[0].split("?", 1)[0]
        return path.rsplit("/", 1)[-1]
if __name__ == '__main__':
    # Step 1: open the page in the browser and gather the image URLs.
    getsrc = GetpicHtml()
    urllist = getsrc.gethtml()
    # Step 2: download them. Constructing DownloadFile alone fetches nothing;
    # control() is what actually walks the URL dict and saves each file.
    downloadpic = DownloadFile(urllist, "/home/isoft_lp/tmp")
    downloadpic.control()
代码说明:
driver.execute_script(js)
调用execute_script方法执行JavaScript脚本,随后会重点说明该方法;
elemlist= self.driver.find_elements_by_xpath("//*[@id='main']/a[*]/img")
通过xpath获取图片元素;
元素内容:
"<img src="https://d2lm6fxwu08ot6.cloudfront.net/img-thumbs/280h/9XCA8GIDBS.jpg" height="280" width="420">";
img_url = elem.get_attribute('src')
获取图片源地址;
filename = re.findall(r"https://d2lm6fxwu08ot6.cloudfront.net/img-thumbs/280h/(.+?).jpg", v)[0] + ".jpg"
获取图片文件的原名字;
response = urllib2.urlopen(url)
urllib.urlretrieve(url, localname)
指定url和本地地址,进行文件的下载
2.自动登录https://github.com/
简要说明,通过selenium打开firefox浏览器,导航到指定网站,点击Sign in,自动输入账号密码,实现自动登录。
代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import os
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
# Address of the site to log in to
url = "https://github.com/"
# Account name and password typed into the login form (placeholder values).
# NOTE(review): avoid committing real credentials in this file.
loginname = "xxxx@xx"
passwordvalue = "xxxxx"
class AutoLoginGithub(object):
    """Log in to GitHub through a Selenium-driven Firefox window."""

    def __init__(self):
        # Creating the object starts a Firefox session right away.
        self.driver = webdriver.Firefox()
        # Element lookups retry for up to 10 seconds, so the login form can
        # be found even if the page is still loading after the click.
        self.driver.implicitly_wait(10)

    def _click(self, element):
        """Move the pointer onto ``element`` and click it."""
        ActionChains(self.driver).move_to_element(element).click(element).perform()

    def getloginpage(self):
        """Open the home page, click "Sign in" and return the driver."""
        self.driver.get(url)
        # Locate the sign-in link by its absolute position in the page.
        signin = self.driver.find_element_by_xpath('/html/body/header/div/div/div/a[2]')
        # Clicking it navigates to the login page.
        self._click(signin)
        return self.driver

    def autologin(self):
        """Fill in the account name and password and submit the login form.

        The browser window is closed afterwards, including when one of the
        Selenium calls raises.
        """
        try:
            # Same driver object, now showing the login page.
            driver = self.getloginpage()
            # Type the account name into its input box.
            driver.find_element_by_id("login_field").send_keys(loginname)
            # Type the password into its input box.
            driver.find_element_by_id("password").send_keys(passwordvalue)
            # Locate and click the submit button.
            dologin = driver.find_element_by_xpath("//*[@id='login']/form/div[4]/input[3]")
            self._click(dologin)
            # Keep the window open briefly after submitting.
            time.sleep(3)
        finally:
            self.driver.close()
if __name__ == "__main__":
    # Script entry point: start the browser session, then run the login flow.
    autologin = AutoLoginGithub()
    autologin.autologin()
|
代码说明:
以下代码,实现按钮的点击
actions = ActionChains(self.driver)
# 移动光标至signin按钮
actions.move_to_element(signin)
actions.click(signin)
actions.perform()
未完待续