AI01, Practical data handling
List of posts to read before reading this article
Contents
Datasets
MNIST
https://github.com/myleott/mnist_png
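A minimal sketch for loading these PNG files into NumPy arrays, assuming the archive has been extracted so images sit under mnist_png/training/<label>/*.png:
import glob
import numpy as np
from matplotlib import image
images, labels = [], []
for label in range(10):
    for path in glob.glob('mnist_png/training/{}/*.png'.format(label))[:100]:  # first 100 per class
        images.append(image.imread(path))  # (28, 28) float array in [0, 1]
        labels.append(label)
images = np.array(images); labels = np.array(labels)
print(images.shape, labels.shape)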
File I/O
json
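A minimal load/save sketch with the standard json module (file names are placeholders):
import json
with open('input_data.json', 'r') as f: # load json
    data = json.load(f)
print(data)
with open('output_data.json', 'w') as f: # save json
    json.dump(data, f, ensure_ascii=False, indent=2)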
Image
jpg
$ pip install matplotlib
$ pip install pillow
import matplotlib.pyplot as plt
from matplotlib import image
img = image.imread('input_image.jpg') # load image (Pillow handles JPEG decoding)
plt.imshow(img)
plt.savefig('output_image.jpg') # save image (call before plt.show())
plt.show()
png
$ pip install matplotlib
$ pip install pillow
import matplotlib.pyplot as plt
from matplotlib import image
img = image.imread('input_image.png') # load image
plt.imshow(img)
plt.savefig('output_image.png') # save image (call before plt.show())
plt.show()
Table
import pandas as pd
df = pd.read_csv('input_table.csv') # load table
df.to_excel('output_table.xlsx') # save table (requires an Excel writer such as openpyxl)
Text
with open('input_text.txt','r') as f: # load text
    text = f.read()
with open('output.txt','w') as f: # save text
    f.write(text)
Sound
from gtts import gTTS
import os
text = "Global warming is the long-term rise in the average temperature of the Earth's climate system"
language = "en"
speech = gTTS(text = text, lang = language, slow = False)
speech.save("text.mp3")
os.system("start text.mp3") # play the saved file ('start' is Windows-only)
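The gTTS snippet above only generates and saves speech; for reading an existing sound file, here is a minimal sketch with the standard wave module (it assumes a 16-bit PCM WAV input; the file name is a placeholder):
import wave
import numpy as np
with wave.open('input_sound.wav', 'rb') as w: # load sound (PCM WAV)
    framerate = w.getframerate()
    n_channels = w.getnchannels()
    frames = w.readframes(w.getnframes()) # raw bytes
samples = np.frombuffer(frames, dtype=np.int16) # assumes 16-bit samples
print(framerate, n_channels, samples.shape)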
Load data from WEB
Developer tools
F12 : Elements (Inspector, Ctrl + Shift + C), Network
/robots.txt
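Before scraping, /robots.txt of the target site tells you which paths may be fetched. A minimal check with the standard urllib.robotparser (the URLs are only examples):
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://www.google.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.google.com/search')) # is this path allowed for any user-agent?
print(rp.can_fetch('*', 'https://www.google.com/maps'))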
Scraping : basic
urllib
API
installation
$ pip install urllib3
$ pip install fake-useragent
urlretrieve(from urllib.request)
download file
from urllib.request import urlretrieve
# from : file url
img_url = 'https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png'
html_url = 'https://www.google.com/'
# to : path
img_save_path = r'S:\workspace\2020-01-19\winscp.jpg'
html_save_path = r'S:\workspace\2020-01-19\index.html'
# download file
img_file, img_header = urlretrieve(img_url,img_save_path); print(img_header)
html_file, html_header = urlretrieve(html_url, html_save_path); print(html_header)
OUTPUT
handling error
urlopen(from urllib.request)
response
from urllib.request import urlopen
file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
response = urlopen(file_url)
print('header_info : {}'.format(response.info()))
print('http_status_code : {}'.format(response.getcode()))
print('geturl : {}'.format(response.geturl()))
print('status : {}'.format(response.status))
print('headers : {}'.format(response.getheaders()))
print('contents : {}'.format(response.read(10))) # response binary data, response.content in module 'requests'
print('contents decode: {}'.format(response.read(10).decode('utf-8'))) # decoded data, response.text in module 'requests'; note: read() consumes the stream, so this call returns the next 10 bytes
save file as an object in Python
from urllib.request import urlopen
# from : file url
# to : path
file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
save_path = r"S:\workspace\2020-01-22\winscp.jpg"
# save file as an object on python
response = urlopen(file_url)
header_info = response.info()
http_status_code = response.getcode()
# download file
contents = response.read()
with open(save_path, 'wb') as c:
    c.write(contents)
OUTPUT
handling error
import urllib.request as req
from urllib.error import URLError, HTTPError
# from : file url
target_url = ["https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png",
"https://google.com"]
# to : path
path_list = [r"S:\workspace\2020-01-22\winscp.jpg",
r"S:\workspace\2020-01-22\index.html"]
# download file
for i, url in enumerate(target_url):
    try:
        response = req.urlopen(url)
        contents = response.read()
        print('---------------------------------------------------')
        print('Header Info-{} : {}'.format(i, response.info()))
        print('HTTP Status Code : {}'.format(response.getcode()))
        print('---------------------------------------------------')
        with open(path_list[i], 'wb') as c:
            c.write(contents)
    except HTTPError as e:
        print("Download failed.")
        print('HTTPError Code : ', e.code)
    except URLError as e:
        print("Download failed.")
        print('URL Error Reason : ', e.reason)
    else:
        print()
        print("Download Succeed.")
requests
installation
$ pip install requests
$ pip install lxml
$ pip install cssselect
Request methods : GET
import requests
response = requests.get("https://www.naver.com")
print(response.text) # response data, response.read().decode('utf-8') in module 'urlopen'
print(response.content) # response binary data, response.read() in module 'urlopen'
print(response.headers) # header
print(response.status_code) # status code
print(response.url) # url
print(response.ok) # ok
print(response.encoding) # encoding
SUPPLEMENT, response.text
response.iter_lines() can be used instead of response.text.
import requests
response = requests.get("https://www.naver.com")
#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    print(line)
with session
import requests
session = requests.Session()
response = session.get("https://www.naver.com")
print(response.text)
print(response.content)
print(response.status_code)
print(response.url)
print(response.ok)
print(response.encoding)
session.close()
or
import requests
with requests.Session() as session:
    response = session.get("https://www.naver.com")
    print(response.text)
    print(response.content)
    print(response.status_code)
    print(response.url)
    print(response.ok)
    print(response.encoding)
with cookies, headers
import requests
response1 = requests.get("https://httpbin.org/cookies", cookies={'name':'kim'})
response2 = requests.get("https://httpbin.org", headers={'user-agent':'nice-man_1.0.0_win10_ram16_home_chrome'})
print(response1, response1.text)
print(response2, response2.text)
another way of carrying cookies
import requests
response = requests.get('https://httpbin.org/cookies')
print(response.text)
jar = requests.cookies.RequestsCookieJar()
jar.set('name', 'niceman', domain='httpbin.org', path='/cookies')
response = requests.get('http://httpbin.org/cookies', cookies=jar)
print(response.text)
{
  "cookies": {}
}
{
  "cookies": {
    "name": "niceman"
  }
}
with timeout
import requests
response = requests.get('https://github.com', timeout=10)
print(response.text)
with json
import requests
response = requests.get('https://jsonplaceholder.typicode.com/posts/1')
print('.headers : \n',response.headers)
print('.text : \n',response.text)
print('.json() : \n', response.json())
print('.json().keys() : \n', response.json().keys())
print('.json().values() : \n',response.json().values())
import requests
import json
response = requests.get('http://httpbin.org/stream/100', stream=True)
#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    b = json.loads(line); print(b) # type(line) = str, type(b) = dict
    for k, v in b.items():
        print("Key: {}, Values: {}".format(k, v))
with lxml
with cssselect
import requests
import lxml.html
response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)
for i in root.cssselect('.api_list .api_item a.api_link'):
    # i.text_content(), i.get('attr')
    url = i.get('href')
    name = i.cssselect('.api_logo')[0].get('alt')
    print(name, url)
OUTPUT
SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'
with xpath
import requests
import lxml.html
response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)
root.make_links_absolute(response.url)
for i in root.xpath('//ul[@class="api_list"]/li[@class="api_item"]/a[@class="api_link"]'):
    url = i.get('href')
    name = i.xpath('./img')[0].get('alt')
    print(name, url)
OUTPUT
SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
print(response.url)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'
https://www.naver.com/
Other request methods : POST, PUT (update/replace), DELETE, PATCH (partial update/modify)
import requests
response = requests.post('http://httpbin.org/post', data={'kim':'stellar'})
print(response.text)
print(response.headers)
import requests
payload1 = {'name': 'kim', 'pay': 'true'}
payload2 = (('name', 'park'), ('pay', 'false'))
response1 = requests.post('http://httpbin.org/post', data=payload1)
response2 = requests.post('http://httpbin.org/post', data=payload2)
print(response1.text)
print(response2.text)
import requests
response = requests.put('http://httpbin.org/put', data={'data': '{"name": "Kim", "grade": "A"}'})
print(response.text)
import requests
response = requests.delete('http://httpbin.org/delete')
print(response.text)
import requests
response = requests.delete('https://jsonplaceholder.typicode.com/posts/1')
print(response.text)
BeautifulSoup
installation
$ pip install beautifulsoup4
Basic
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
print(soup.prettify())
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<h1>
this is h1 area
</h1>
<h2>
this is h2 area
</h2>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">
Tillie
</a>
</p>
<p class="story">
story...
</p>
</body>
</html>
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
h1 = soup.html.body.h1; print(h1, h1.string) # h1 tag
p = soup.html.body.p; print(p, p.string) # p tag
<h1>this is h1 area</h1> this is h1 area
<p class="title"><b>The Dormouse's story</b></p> The Dormouse's story
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
p = soup.html.body.p; print('p', p)
p2 = p.next_sibling.next_sibling; print('p2', p2)
p3 = p.next_sibling.next_sibling.next_sibling.next_sibling; print('p3', p3)
p4 = p.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling; print('p4', p4)
p <p class="title"><b>The Dormouse's story</b></p>
p2 <p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
p3 <p class="story">story...</p>
p4 None
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
p = soup.html.body.p
p2 = p.next_sibling.next_sibling
print(list(p2.next_elements))
for i in p2.next_elements:
    print(i)
['Once upon a time there were three little sisters\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 'Lacie', '\n', <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', '\n', '\n', <p class="story">story...</p>, 'story...', '\n', '\n', '\n']
Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
<p class="story">story...</p>
story...
FIND
find_all
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", class_='sister')
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
Elsie Elsie
Lacie Lacie
Tillie Tillie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", string=["Elsie","Tillie"], id="link1")
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Elsie Elsie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", limit=2)
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Elsie Elsie
Lacie Lacie
find
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find("a") # the first tag that was found
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie Elsie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find("a", {"class": "sister", "data-io": "link3"}) # multiple condition
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie Tillie
SELECT
select_one
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
select_b = soup.select_one("p.title > b")
select_idlink1 = soup.select_one("a#link1")
select_valuelink3 = soup.select_one("a[data-io='link3']")
print(select_b, select_b.string)
print(select_idlink1, select_idlink1.string)
print(select_valuelink3, select_valuelink3.string)
<b>The Dormouse's story</b> The Dormouse's story
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> Elsie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a> Tillie
select
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
select_a = soup.select("p.story > a")
select_a2 = soup.select("p.story > a:nth-of-type(2)")
select_classstory = soup.select("p.story")
print(select_a)
print(select_a[0])
print(select_a[1])
print(select_a[2])
print(select_a[0]['href'])
print(select_a[1]['href'])
print(select_a[2]['href'])
print()
print(select_a2)
print(select_a2[0])
print(select_a2[0]['id'])
print()
print(select_classstory)
print(select_classstory[0])
print(select_classstory[1])
print(select_classstory[0]['class'])
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
link2
[<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>, <p class="story">story...</p>]
<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
['story']
Selenium
Installation
$ pip install selenium
Access web(with web driver)
web drivers (Chrome, Firefox, PhantomJS) official download links
without option
from selenium import webdriver
browser = webdriver.Chrome('webdriver/chromedriver.exe') # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()
with option(headless)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('webdriver/chromedriver.exe', options=chrome_options) # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()
result without option
web driver method
print(browser.page_source)
print(browser.session_id)
print(browser.title)
print(browser.current_url)
print(browser.get_cookies())
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="ko"><head>
<meta charset="utf-8" />
<meta name="Referrer" content="origin" />
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=1100" />
<meta name="apple-mobile-web-app-title" content="NAVER" />
<meta name="robots" content="index,nofollow" />
<meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" />
...
...
...
if (window.addEventListener) {
window.addEventListener("load", function() { loadJS(); }, true);
} else if (window.attachEvent) {
window.attachEvent("onload", loadJS);
} else {
window.onload = loadJS;
}
</script>
</body></html>
aa015a79b7acddf96892e138b1e75e31
NAVER
https://www.naver.com/
[{'domain': '.naver.com', 'expiry': 1612652898, 'httpOnly': False, 'name': 'NRTK', 'path': '/', 'secure': False, 'value': 'ag#all_gr#1_ma#-2_si#0_en#0_sp#0'}, {'domain': '.naver.com', 'expiry': 2524640401.665735, 'httpOnly': False, 'name': 'NNB', 'path': '/', 'secure': True, 'value': 'EFLEUDPB5U6V4'}, {'domain': 'www.naver.com', 'expiry': 1581203297.710821, 'httpOnly': True, 'name': 'PM_CK_loc', 'path': '/', 'secure': False, 'value': 'd2d101bc3885853d3f553a325db5c09b55091e808f79a0dfdb0fb274ee3cfd30'}]
Search keyword & screen shot
from selenium import webdriver
browser = webdriver.Chrome('webdriver/chromedriver.exe') # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.daum.net')
# search keyword
element = browser.find_element_by_css_selector('div.inner_search > input.tf_keyword')
element.send_keys('lion') # search word(input keyword)
element.submit() # form submit
# screen shot
browser.save_screenshot("website_ch1.png") # saving way 1
browser.get_screenshot_as_file("website_ch2.png") # saving way 2
browser.quit()
Click(with Explicitly wait)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
# Explicitly wait
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())
browser.quit()
Click(with Implicitly wait)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
# Implicitly wait
time.sleep(3); browser.find_element_by_xpath('//*[@id="dlMaker_simple"]/dd/div[2]/button[1]').click()
time.sleep(2); browser.find_element_by_xpath('//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label').click()
time.sleep(3)
# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())
browser.quit()
Application(click page-number)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
cur_page_num = 1; target_crawl_num = 5
while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, image, price
            print(v.select('p.prod_name > a')[0].text.strip())
            print(v.select('a.thumb_link > img')[0]['src'])
            print(v.select('p.price_sect > a')[0].text.strip())
    cur_page_num += 1 # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break
    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)
browser.quit()
xlsxwriter
$ pip install xlsxwriter
import urllib.request as req
from io import BytesIO
import xlsxwriter
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
workbook = xlsxwriter.Workbook("crawling_result.xlsx")
worksheet = workbook.add_worksheet()
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
cur_page_num = 1; target_crawl_num = 5
ins_cnt = 1 # excel row number
while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, price
            prod_name = v.select('p.prod_name > a')[0].text.strip()
            prod_price = v.select('p.price_sect > a')[0].text.strip()
            # save excel(text)
            worksheet.write('A%s' % ins_cnt, prod_name)
            worksheet.write('B%s' % ins_cnt, prod_price)
            """
            # product image
            img_data = BytesIO(req.urlopen(v.select('a.thumb_link > img')[0]['data-original']).read())
            # save excel(image)
            worksheet.insert_image('C%s' % ins_cnt, prod_name, {'image_data': img_data})
            """
            ins_cnt += 1 # next row
    cur_page_num += 1 # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break
    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)
browser.quit()
workbook.close()
Making excel file
import urllib.request as req
from io import BytesIO
import xlsxwriter
workbook = xlsxwriter.Workbook("excel.xlsx")
worksheet = workbook.add_worksheet()
img_data = BytesIO(req.urlopen('http://img.danawa.com/prod_img/500000/866/250/img/5250866_1.jpg?shrink=160:160&_v=20180713162212').read())
worksheet.write('A1', 'prod_name')
worksheet.write('B1', 'prod_price')
worksheet.insert_image('C1', 'prod_name', {'image_data': img_data})
workbook.close()
Scraping : advanced
installation
pip install scrapy
pip install pypiwin32
Scrapy framework
$ scrapy startproject [project_name]
$ cd [project_name]
$ scrapy genspider [spider_name] [Crawling_URL]
$ scrapy crawl [spider_name]
$ cd [project_name]/spiders
$ scrapy runspider [spider_name].py
parse
response.css
getall()
get()
extract()
extract_first()
response.xpath
getall()
get()
extract()
extract_first()
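A minimal spider sketch showing where parse() and these selector methods fit; the URL and the CSS/XPath expressions are only examples, and get()/getall() are the newer names for extract_first()/extract():
import scrapy
class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://quotes.toscrape.com/']
    def parse(self, response):
        texts = response.css('span.text::text').getall()      # all matches, same as .extract()
        first = response.css('span.text::text').get()          # first match, same as .extract_first()
        authors = response.xpath('//small[@class="author"]/text()').getall()
        for text, author in zip(texts, authors):
            yield {'text': text, 'author': author}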
Shell
$ scrapy shell
$ scrapy shell [crawling_url]
$ scrapy shell [crawling_url] --set="ROBOTSTXT_OBEY=False"
Spider
Selectors
Items
Exports
Settings
Pipeline
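A rough sketch of how these pieces fit together; the project name 'myproject' and the field names are placeholders, not part of any particular project:
# items.py : define the fields a spider yields
import scrapy
class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
# pipelines.py : post-process every yielded item
class StripPricePipeline:
    def process_item(self, item, spider):
        item['price'] = item['price'].strip()
        return item
# settings.py : register the pipeline (lower number = earlier in the chain)
# ITEM_PIPELINES = {'myproject.pipelines.StripPricePipeline': 300}
# exports : write results to a file from the command line
# $ scrapy crawl example -o result.csv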
Scrapy project
Example for scraping
EX1, encar
from urllib.request import urlopen
from urllib.parse import urlparse
# with urlopen
response_1 = urlopen("http://www.encar.com/")
print('type : {}'.format(type(response_1)))
print("geturl : {}".format(response_1.geturl()))
print("status : {}".format(response_1.status))
print("headers : {}".format(response_1.getheaders()))
print("getcode : {}".format(response_1.getcode()))
print("read : {}".format(response_1.read(1).decode('utf-8')))
# with urlparse
response_2 = urlparse('http://www.encar.co.kr?test=test')
print('total parse : {}'.format(response_2))
print('partial parse : {}'.format(response_2.query))
EX2, ipify
import urllib.request
from urllib.parse import urlparse
# request
API = "https://api.ipify.org" # some request url
values = {'format': 'json'} # 'text' or 'jsonp' can also be used instead of 'json'
params = urllib.parse.urlencode(values) # get parameter by encoding
url = API + "?" + params # request url
# response
data = urllib.request.urlopen(url).read() # read response data
text = data.decode("utf-8") # decode read data
print('response : {}'.format(text))
SUPPLEMENT
values = {'format': 'json'} # 'text' or 'jsonp' can also be used instead of 'json'
params = urllib.parse.urlencode(values) # get parameter by encoding
print(params)
format=json
EX3, mois
import urllib.request
import urllib.parse
API = "http://www.mois.go.kr/gpms/view/jsp/rss/rss.jsp"
params = []
for num in [1001, 1012, 1013, 1014]:
    params.append(dict(ctxCd=num))
for i in params:
    param = urllib.parse.urlencode(i)
    url = API + "?" + param
    res_data = urllib.request.urlopen(url).read()
    contents = res_data.decode("utf-8")
    print(contents)
SUPPLEMENT
for i in params:
    print(i)
    param = urllib.parse.urlencode(i)
    print(param)
{'ctxCd': 1001}
ctxCd=1001
{'ctxCd': 1012}
ctxCd=1012
{'ctxCd': 1013}
ctxCd=1013
{'ctxCd': 1014}
ctxCd=1014
EX4, daum finance
import json
import urllib.request as req
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent' : ua.ie,
'referer' : 'https://finance.daum.net/'}
url = "https://finance.daum.net/api/search/ranks?limit=10"
res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
rank_json = json.loads(res)['data'] # str -> json
for elm in rank_json:
    print('rank : {}, trade price : {}, name : {}'.format(elm['rank'], elm['tradePrice'], elm['name']))
SUPPLEMENT
from fake_useragent import UserAgent
ua = UserAgent()
print(ua.ie)
print(ua.msie)
print(ua.chrome)
print(ua.safari)
print(ua.random)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36
Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36
res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
{"data":[{"rank":1,"rankChange":0,"symbolCode":"A005930","code":"KR7005930003","name":"삼성전자","tradePrice":57100,"change":"FALL","changePrice":2000,"changeRate":0.0338409475,"chartSlideImage":null,"isNew":false},{"rank":2,"rankChange":2,"symbolCode":"A308170","code":"KR7308170000","name":"센트랄모텍","tradePrice":42450,"change":"RISE","changePrice":5750,"changeRate":0.1566757493,"chartSlideImage":null,"isNew":false},{"rank":3,"rankChange":5,"symbolCode":"A068270","code":"KR7068270008","name":"셀트리온","tradePrice":166500,"change":"FALL","changePrice":4500,"changeRate":0.0263157895,"chartSlideImage":null,"isNew":false},{"rank":4,"rankChange":-1,"symbolCode":"A226440","code":"KR7226440006","name":"한송네오텍","tradePrice":1930,"change":"RISE","changePrice":270,"changeRate":0.1626506024,"chartSlideImage":null,"isNew":false},{"rank":5,"rankChange":0,"symbolCode":"A028300","code":"KR7028300002","name":"에이치엘비","tradePrice":96300,"change":"FALL","changePrice":3500,"changeRate":0.0350701403,"chartSlideImage":null,"isNew":false},{"rank":6,"rankChange":-4,"symbolCode":"A215600","code":"KR7215600008","name":"신라젠","tradePrice":13750,"change":"FALL","changePrice":850,"changeRate":0.0582191781,"chartSlideImage":null,"isNew":false},{"rank":7,"rankChange":0,"symbolCode":"A011000","code":"KR7011000007","name":"진원생명과학","tradePrice":5590,"change":"RISE","changePrice":240,"changeRate":0.0448598131,"chartSlideImage":null,"isNew":true},{"rank":8,"rankChange":-1,"symbolCode":"A091990","code":"KR7091990002","name":"셀트리온헬스케어","tradePrice":55600,"change":"FALL","changePrice":500,"changeRate":0.008912656,"chartSlideImage":null,"isNew":false},{"rank":9,"rankChange":0,"symbolCode":"A045060","code":"KR7045060001","name":"오공","tradePrice":7920,"change":"RISE","changePrice":230,"changeRate":0.0299089727,"chartSlideImage":null,"isNew":false},{"rank":10,"rankChange":-4,"symbolCode":"A036540","code":"KR7036540003","name":"SFA반도체","tradePrice":6190,"change":"RISE","changePrice":340,"changeRate":0.0581196581,"chartSlideImage":null,"isNew":false}]}
EX5, naver image search
import os
import urllib.parse as rep
import urllib.request as req
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
opener = req.build_opener() # Header info initializer
opener.addheaders = [('User-agent', UserAgent().ie)] # User-Agent info
req.install_opener(opener) # Insert header info
base = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query=" # naver image search URL
quote = rep.quote_plus("신소율") # search word
url = base + quote # combine basic URL + search word
response = req.urlopen(url) # request
# error handling (related to creating the folder)
save_path = "image" # image save folder at current path
try:
    if not (os.path.isdir(save_path)): # check whether the folder already exists
        os.makedirs(os.path.join(save_path)) # if not, create it
except OSError as e:
    print("folder creation failed!")
    print("folder name : {}".format(e.filename))
    raise RuntimeError('System Exit!')
else:
    print('folder is created!')
# bs4 initializer
soup = BeautifulSoup(response, "html.parser")
img_list = soup.select("div.img_area > a.thumb._thumb > img")
# download, numbering images
for i, img_list in enumerate(img_list, 1):
    file_name = os.path.join(save_path, save_path + str(i) + '.png'); print('image name : {}'.format(file_name))
    req.urlretrieve(img_list['data-source'], file_name)
print("download succeeded!")
folder is created!
image name : image\image1.png
image name : image\image2.png
image name : image\image3.png
image name : image\image4.png
image name : image\image5.png
image name : image\image6.png
image name : image\image7.png
image name : image\image8.png
image name : image\image9.png
image name : image\image10.png
image name : image\image11.png
image name : image\image12.png
image name : image\image13.png
image name : image\image14.png
image name : image\image15.png
image name : image\image16.png
image name : image\image17.png
image name : image\image18.png
image name : image\image19.png
image name : image\image20.png
image name : image\image21.png
image name : image\image22.png
image name : image\image23.png
image name : image\image24.png
image name : image\image25.png
image name : image\image26.png
image name : image\image27.png
image name : image\image28.png
image name : image\image29.png
image name : image\image30.png
image name : image\image31.png
image name : image\image32.png
image name : image\image33.png
image name : image\image34.png
image name : image\image35.png
image name : image\image36.png
image name : image\image37.png
image name : image\image38.png
image name : image\image39.png
image name : image\image40.png
image name : image\image41.png
image name : image\image42.png
image name : image\image43.png
image name : image\image44.png
image name : image\image45.png
image name : image\image46.png
image name : image\image47.png
image name : image\image48.png
image name : image\image49.png
image name : image\image50.png
download succeeded!
EX6, danawa log-in
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# /login/form-data on developer tools
login_info = {
'redirectUrl': 'http://www.danawa.com/',
'loginMemberType': 'general',
'id': '',
'password': ''
}
# request headers on developer tools
request_headers = {
'User-Agent': UserAgent().chrome,
'Referer': 'https://auth.danawa.com/login?url=http%3A%2F%2Fcws.danawa.com%2Fpoint%2Findex.php'
}
with requests.session() as s:
    # Request(try log-in)
    response = s.post('https://auth.danawa.com/login', login_info, headers=request_headers)
    # if log-in fails
    if response.status_code != 200:
        raise Exception('Login failed.')
    # move to a page that requires the session info after log-in
    response = s.get('http://www.danawa.com/member/myPage.php', headers=request_headers)
    # EUC-KR (if Korean text is garbled)
    # response.encoding = 'euc-kr'
    # bs4 initializer
    soup = BeautifulSoup(response.text, "html.parser")
    # check whether log-in was successful
    check_name = soup.find('p', class_="p_id")
    if check_name is None:
        raise Exception('Login failed. Wrong Password.')
    else:
        print('log-in is successful')
log-in is successful
Save data to DB
h5
import h5py
import numpy as np
f = h5py.File('input_big_data.h5','r') # load big_data
for i in f.keys():
    info = f.get(i) # show information about big_data
    print(info)
    data = np.array(info) # show big_data
    print(data)
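The snippet above only reads an existing file; a minimal sketch for writing one with h5py (the dataset name and array shape are arbitrary):
import h5py
import numpy as np
data = np.random.rand(1000, 10) # example array
with h5py.File('output_big_data.h5', 'w') as f: # save big_data
    f.create_dataset('dataset_1', data=data, compression='gzip')
    f['dataset_1'].attrs['description'] = 'example dataset'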
List of posts followed by this article