
AI01, Practical data handling

List of posts to read before reading this article


Contents


Datasets

datasets

MNIST

https://github.com/myleott/mnist_png





File I/O

json
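
A minimal sketch of reading and writing a JSON file with the standard json module (file names are placeholders):

import json

data = {'name': 'kim', 'age': 20}       # any serializable python object

with open('output.json', 'w') as f:     # save json
    json.dump(data, f, indent=4)

with open('output.json', 'r') as f:     # load json
    loaded = json.load(f)
print(loaded)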





Image

jpg

$ pip install matplotlib
import matplotlib.pyplot as plt
from matplotlib import image

img = image.imread('input_image.jpg')   # load image as a numpy array
plt.imshow(img)
plt.savefig('output_image.jpg')         # save the current figure
plt.show()

png

$ pip install matplotlib
$ pip install pillow
import matplotlib.pyplot as plt
from matplotlib import image

img = image.imread('input_image.png')   # load image as a numpy array
plt.imshow(img)
plt.savefig('output_image.png')         # save the current figure
plt.show()





Table

$ pip install pandas
$ pip install openpyxl
import pandas as pd

df = pd.read_csv('input_table.csv')    # load table from csv
df.to_excel('output_table.xlsx')       # save table to xlsx (requires openpyxl or xlsxwriter)





Text

with open('input_text.txt','r') as f:  # load text
    text = f.read()
with open('output.txt','w') as f:      # save text
    f.write(text)




Sound

URL

$ pip install gTTS
from gtts import gTTS
import os

text = "Global warming is the long-term rise in the average temperature of the Earth's climate system"
language = "en"

speech = gTTS(text=text, lang=language, slow=False)   # text-to-speech
speech.save("text.mp3")                               # save speech as an mp3 file
os.system("start text.mp3")                           # play the file (Windows only)





Load data from WEB

image

Developer tools

F12 : Elements (inspector, Ctrl + Shift + C) and Network tabs

Check the target site's /robots.txt before scraping to see which paths crawlers are allowed to access.

image
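
As a quick check, the standard library can also read robots.txt programmatically; a sketch (the target URL is an example):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://www.naver.com/robots.txt')       # robots.txt of the target site
rp.read()
print(rp.can_fetch('*', 'https://www.naver.com/'))   # True if crawling this path is allowed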




Scraping : basic

urllib

API
installation

$ pip install urllib3          # optional; urllib.request used below is part of the standard library
$ pip install fake-useragent




urlretrieve(from urllib.request)
download file

from urllib.request import urlretrieve

# from : file url
img_url = 'https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png'
html_url = 'https://www.google.com/'

# to : path
img_save_path = r'S:\workspace\2020-01-19\winscp.jpg'
html_save_path = r'S:\workspace\2020-01-19\index.html'

# download file
img_file, img_header = urlretrieve(img_url,img_save_path); print(img_header)
html_file, html_header = urlretrieve(html_url, html_save_path); print(html_header)
OUTPUT

image image image image


handling error
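
A minimal sketch of catching download errors around urlretrieve, reusing img_url and img_save_path from the block above:

from urllib.request import urlretrieve
from urllib.error import URLError, HTTPError

try:
    urlretrieve(img_url, img_save_path)                    # same url/path as above
except HTTPError as e:                                     # HTTP error responses (404, 500, ...)
    print('Download failed. HTTPError code :', e.code)
except URLError as e:                                      # network / URL level errors
    print('Download failed. URLError reason :', e.reason)
else:
    print('Download succeeded.')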




urlopen(from urllib.request)
response

from urllib.request import urlopen

file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
response = urlopen(file_url)

print('header_info : {}'.format(response.info()))
print('http_status_code : {}'.format(response.getcode()))
print('geturl : {}'.format(response.geturl()))
print('status : {}'.format(response.status))
print('headers : {}'.format(response.getheaders()))
print('contents : {}'.format(response.read(10)))                          # response binary data, response.content in module 'requests'
print('contents decode: {}'.format(response.read(10).decode('utf-8')))    # response data, response.text in module 'requests'

image


save the response as an object in Python

from urllib.request import urlopen

# from : file url
# to : path
file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
save_path = r"S:\workspace\2020-01-22\winscp.jpg"

# save file as an object on python
response = urlopen(file_url)
header_info = response.info()
http_status_code = response.getcode()

# download file
contents = response.read()
with open(save_path, 'wb') as c:
    c.write(contents)
OUTPUT

image image


handling error
import urllib.request as req
from urllib.error import URLError, HTTPError

# from : file url
target_url = ["https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png",
              "https://google.com"]

# to : path
path_list = [r"S:\workspace\2020-01-22\winscp.jpg",
             r"S:\workspace\2020-01-22\index.html"]

# download file
for i, url in enumerate(target_url):
    try:
        response = req.urlopen(url)
        contents = response.read()
        print('---------------------------------------------------')
        print('Header Info-{} : {}'.format(i, response.info()))
        print('HTTP Status Code : {}'.format(response.getcode()))
        print('---------------------------------------------------')
        
        with open(path_list[i], 'wb') as c:
            c.write(contents)

    except HTTPError as e:
        print("Download failed.")
        print('HTTPError Code : ', e.code)

    except URLError as e:
        print("Download failed.")
        print('URL Error Reason : ', e.reason)

    else:
        print()
        print("Download Succeed.")




requests

API

installation

$ pip install requests
$ pip install lxml
$ pip install cssselect




Request methods : GET

import requests

response = requests.get("https://www.naver.com")

print(response.text)          # response data, response.read().decode('utf-8') in module 'urlopen'            
print(response.content)       # response binary data, response.read() in module 'urlopen'
print(response.headers)       # header
print(response.status_code)   # status code
print(response.url)           # url
print(response.ok)            # ok
print(response.encoding)      # encoding

image image

SUPPLEMENT, response.text

response.iter_lines() can be used instead of response.text to read the body line by line.

import requests

response = requests.get("https://www.naver.com")

#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    print(line)    

image


with session
import requests

session = requests.Session()
response = session.get("https://www.naver.com")

print(response.text)
print(response.content)
print(response.status_code)
print(response.url)
print(response.ok)
print(response.encoding)

session.close()

or

import requests

with requests.Session() as session:
    response = session.get("https://www.naver.com")
    
    print(response.text)
    print(response.content)
    print(response.status_code)
    print(response.url)
    print(response.ok)
    print(response.encoding)

with cookies, headers
import requests

response1 = requests.get("https://httpbin.org/cookies", cookies={'name':'kim'})
response2 = requests.get("https://httpbin.org", headers={'user-agent':'nice-man_1.0.0_win10_ram16_home_chrome'})

print(response1, response1.text)
print(response2, response2.text)

image

another way of carrying cookies
import requests

response = requests.get('https://httpbin.org/cookies')
print(response.text)

jar = requests.cookies.RequestsCookieJar()
jar.set('name', 'niceman', domain='httpbin.org', path='/cookies')
response = requests.get('http://httpbin.org/cookies', cookies=jar)
print(response.text)
{
  "cookies": {}
}

{
  "cookies": {
    "name": "niceman"
  }
}


with timeout
import requests

response = requests.get('https://github.com', timeout=10)
print(response.text)

image





with json

import requests

response = requests.get('https://jsonplaceholder.typicode.com/posts/1')

print('.headers : \n',response.headers)
print('.text : \n',response.text)
print('.json() : \n', response.json())
print('.json().keys() : \n', response.json().keys())
print('.json().values() : \n',response.json().values())

image


import requests
import json

response = requests.get('http://httpbin.org/stream/100', stream=True)

#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    b = json.loads(line); print(b)    # type(line) = str, type(b) = dict

    for k, v in b.items():
        print("Key: {}, Values: {}".format(k, v))

image




with lxml
with cssselect

css_selectors
image

import requests
import lxml.html

response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)

for i in root.cssselect('.api_list .api_item a.api_link'):
    # i.text_content(), i.get('attr')
    
    url = i.get('href')
    name = i.cssselect('.api_logo')[0].get('alt');
    
    print(name, url)
OUTPUT

image image


SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'


with xpath

import requests
import lxml.html

response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)
root.make_links_absolute(response.url)

for i in root.xpath('//ul[@class="api_list"]/li[@class="api_item"]/a[@class="api_link"]'):
    url = i.get('href')
    name = i.xpath('./img')[0].get('alt')

    print(name, url)
OUTPUT

image image


SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
print(response.url)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'
https://www.naver.com/




Other request methods : POST, DELETE, PUT (update, replace), PATCH (partial update, modify)
image

import requests

response = requests.post('http://httpbin.org/post', data={'kim':'stellar'})
print(response.text)
print(response.headers)

image

import requests

payload1 = {'name': 'kim', 'pay': 'true'}
payload2 = (('name', 'park'), ('pay', 'false'))

response1 = requests.post('http://httpbin.org/post', data=payload1)
response2 = requests.post('http://httpbin.org/post', data=payload2)

print(response1.text)
print(response2.text)

image


import requests

response = requests.put('http://httpbin.org/put', data={'data': '{"name": "Kim", "grade": "A"}'})
print(response.text)

image




import requests

response = requests.delete('http://httpbin.org/delete')
print(response.text)

image

import requests

response = requests.delete('https://jsonplaceholder.typicode.com/posts/1')
print(response.text)

image




BeautifulSoup

API

installation

$ pip install beautifulsoup4




Basic

from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')
print(soup.prettify())
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <h1>
   this is h1 area
  </h1>
  <h2>
   this is h2 area
  </h2>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
  </p>
  <p class="story">
   story...
  </p>
 </body>
</html>




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

h1 = soup.html.body.h1; print(h1, h1.string)    # h1 tag
p = soup.html.body.p; print(p, p.string)        # first p tag
<h1>this is h1 area</h1> this is h1 area
<p class="title"><b>The Dormouse's story</b></p> The Dormouse's story




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

p = soup.html.body.p; print('p', p)
p2 = p.next_sibling.next_sibling; print('p2', p2)
p3 = p.next_sibling.next_sibling.next_sibling.next_sibling; print('p3', p3)
p4 = p.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling; print('p4', p4)
p <p class="title"><b>The Dormouse's story</b></p>
p2 <p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
p3 <p class="story">story...</p>
p4 None




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

p = soup.html.body.p
p2 = p.next_sibling.next_sibling

print(list(p2.next_elements))
for i in p2.next_elements:
    print(i)
['Once upon a time there were three little sisters\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 'Lacie', '\n', <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', '\n', '\n', <p class="story">story...</p>, 'story...', '\n', '\n', '\n']
Once upon a time there were three little sisters

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie


<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie


<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie




<p class="story">story...</p>
story...










FIND
find_all

from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

tag_a = soup.find_all("a", class_='sister')
print(tag_a)

for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
Elsie Elsie
Lacie Lacie
Tillie Tillie




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

tag_a = soup.find_all("a", string=["Elsie","Tillie"], id="link1")
print(tag_a)

for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Elsie Elsie




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

tag_a = soup.find_all("a", limit=2)
print(tag_a)

for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Elsie Elsie
Lacie Lacie




find

from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

tag_a = soup.find("a")   # the first tag that was found
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie Elsie




from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

tag_a = soup.find("a", {"class": "sister", "data-io": "link3"})    # multiple condition
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie Tillie




SELECT
select_one

from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

select_b = soup.select_one("p.title > b")
select_idlink1 = soup.select_one("a#link1")
select_valuelink3 = soup.select_one("a[data-io='link3']")

print(select_b, select_b.string)
print(select_idlink1, select_idlink1.string)
print(select_valuelink3, select_valuelink3.string)
<b>The Dormouse's story</b> The Dormouse's story
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> Elsie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a> Tillie




select

from bs4 import BeautifulSoup

HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

soup = BeautifulSoup(HTML, 'html.parser')

select_a = soup.select("p.story > a")
select_a2 = soup.select("p.story > a:nth-of-type(2)")
select_classstory = soup.select("p.story")

print(select_a)
print(select_a[0])
print(select_a[1])
print(select_a[2])
print(select_a[0]['href'])
print(select_a[1]['href'])
print(select_a[2]['href'])
print()

print(select_a2)
print(select_a2[0])
print(select_a2[0]['id'])
print()

print(select_classstory)
print(select_classstory[0])
print(select_classstory[1])
print(select_classstory[0]['class'])
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
link2

[<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>, <p class="story">story...</p>]
<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
['story']




Selenium

API

Installation

$ pip install selenium




Access web(with web driver)
web drivers(chrome, firefox, phantomjs) official download links

without option

from selenium import webdriver

browser = webdriver.Chrome('webdriver/chromedriver.exe')  # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)            # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()

with option(headless)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome('webdriver/chromedriver.exe', options=chrome_options)  # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)            # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()

result without option

image

web driver method
print(browser.page_source)
print(browser.session_id)
print(browser.title)
print(browser.current_url)
print(browser.get_cookies())
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="ko"><head>
<meta charset="utf-8" />
<meta name="Referrer" content="origin" />
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=1100" />
<meta name="apple-mobile-web-app-title" content="NAVER" />
<meta name="robots" content="index,nofollow" />
<meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" />
...
...
...

                if (window.addEventListener) {
                        window.addEventListener("load", function() { loadJS(); }, true);
                } else if (window.attachEvent) {
                        window.attachEvent("onload", loadJS);
                } else {
                        window.onload = loadJS;
                }

        </script>


</body></html>
aa015a79b7acddf96892e138b1e75e31
NAVER
https://www.naver.com/
[{'domain': '.naver.com', 'expiry': 1612652898, 'httpOnly': False, 'name': 'NRTK', 'path': '/', 'secure': False, 'value': 'ag#all_gr#1_ma#-2_si#0_en#0_sp#0'}, {'domain': '.naver.com', 'expiry': 2524640401.665735, 'httpOnly': False, 'name': 'NNB', 'path': '/', 'secure': True, 'value': 'EFLEUDPB5U6V4'}, {'domain': 'www.naver.com', 'expiry': 1581203297.710821, 'httpOnly': True, 'name': 'PM_CK_loc', 'path': '/', 'secure': False, 'value': 'd2d101bc3885853d3f553a325db5c09b55091e808f79a0dfdb0fb274ee3cfd30'}]




Search keyword & screen shot

from selenium import webdriver

browser = webdriver.Chrome('webdriver/chromedriver.exe')   # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)            # maximize_window(), minimize_window()
browser.get('https://www.daum.net')

# search keyword
element = browser.find_element_by_css_selector('div.inner_search > input.tf_keyword')
element.send_keys('lion')          # search word(input keyword)
element.submit()                   # form submit

# screen shot
browser.save_screenshot("website_ch1.png")             # saving way 1
browser.get_screenshot_as_file("website_ch2.png")      # saving way 2
browser.quit()

image




Click (with explicit wait)

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)  # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')

# Explicitly wait
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)

# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())

browser.quit()
Click (with implicit wait)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)  # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')

# Implicitly wait
time.sleep(3); browser.find_element_by_xpath('//*[@id="dlMaker_simple"]/dd/div[2]/button[1]').click()
time.sleep(2); browser.find_element_by_xpath('//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label').click()
time.sleep(3)

# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())

browser.quit()

Application (click page number)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)  # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')

WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)


cur_page_num = 1; target_crawl_num = 5
while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, image, price
            print(v.select('p.prod_name > a')[0].text.strip())
            print(v.select('a.thumb_link > img')[0]['src'])
            print(v.select('p.price_sect > a')[0].text.strip())
    cur_page_num += 1   # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break
    
    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)

browser.quit()




xlsxwriter

$ pip install xlsxwriter
import urllib.request as req
from io import BytesIO
import xlsxwriter

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

workbook = xlsxwriter.Workbook("crawling_result.xlsx")
worksheet = workbook.add_worksheet()

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280)  # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')

WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)

cur_page_num = 1; target_crawl_num = 5
ins_cnt = 1  # excel row number

while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, price
            prod_name = v.select('p.prod_name > a')[0].text.strip()
            prod_price = v.select('p.price_sect > a')[0].text.strip()
            # save excel(text)
            worksheet.write('A%s' % ins_cnt, prod_name)
            worksheet.write('B%s' % ins_cnt, prod_price)

            """
            # product image
            img_data = BytesIO(req.urlopen(v.select('a.thumb_link > img')[0]['data-original']).read())
            
            # save excel(image)
            worksheet.insert_image('C%s' % ins_cnt, prod_name, {'image_data': img_data})
            """

            ins_cnt += 1   # next row
    cur_page_num += 1      # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break

    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)

browser.quit()
workbook.close()
Making excel file
import urllib.request as req
from io import BytesIO
import xlsxwriter

workbook = xlsxwriter.Workbook("excel.xlsx")
worksheet = workbook.add_worksheet()

img_data = BytesIO(req.urlopen('http://img.danawa.com/prod_img/500000/866/250/img/5250866_1.jpg?shrink=160:160&_v=20180713162212').read())

worksheet.write('A1', 'prod_name')
worksheet.write('B1', 'prod_price')
worksheet.insert_image('C1', 'prod_name', {'image_data': img_data})

workbook.close()





Scraping : advanced

installation

$ pip install scrapy
$ pip install pypiwin32




Scrapy framework

image

$ scrapy startproject [project_name]

$ cd [project_name]
$ scrapy genspider [spider_name] [Crawling_URL]
$ scrapy crawl [spider_name]

$ cd [project_name]/spiders
$ scrapy runspider [spider_name].py




parse
image

response.css
getall()



get()



extract()



extract_first()



response.xpath
getall()



get()



extract()



extract_first()
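
A minimal sketch of how these selector methods behave on a Scrapy response (the HTML snippet and URL are made up for illustration):

from scrapy.http import HtmlResponse

# a fake response, only to illustrate the selector API
body = b'<ul><li class="item">a</li><li class="item">b</li></ul>'
response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')

print(response.css('li.item::text').getall())          # ['a', 'b'] : all matches
print(response.css('li.item::text').get())             # 'a'        : first match (or None)
print(response.css('li.item::text').extract())         # same as getall()
print(response.css('li.item::text').extract_first())   # same as get()

print(response.xpath('//li[@class="item"]/text()').getall())
print(response.xpath('//li[@class="item"]/text()').get())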



Shell

$ scrapy shell
$ scrapy shell [crawling_url]
$ scrapy shell [crawling_url] --set="ROBOTSTXT_OBEY=False"




Spider
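
A minimal spider sketch, following the standard Scrapy tutorial pattern (quotes.toscrape.com is a public practice site; the selectors are assumptions):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'                                # used by 'scrapy crawl quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:                              # follow pagination
            yield response.follow(next_page, callback=self.parse)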




Selectors




Items
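
A sketch of an Item definition in items.py (the field names are examples):

import scrapy

class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()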




Exports
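
Scraped items can be exported from the command line with the -o option; the file extension selects the format (file names are examples):

$ scrapy crawl [spider_name] -o items.json
$ scrapy crawl [spider_name] -o items.csv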




Settings
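
Typical entries in the project's settings.py (the values are examples):

ROBOTSTXT_OBEY = True        # respect robots.txt
DOWNLOAD_DELAY = 1           # seconds to wait between requests
USER_AGENT = 'my-crawler (+http://example.com)'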




Pipeline
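
A sketch of an item pipeline in pipelines.py; it must be registered in ITEM_PIPELINES in settings.py (the project path is an assumption):

class SavePipeline:
    def process_item(self, item, spider):
        # called once for every item yielded by a spider
        print(item)
        return item                 # return the item to pass it to the next pipeline

# settings.py
# ITEM_PIPELINES = {'[project_name].pipelines.SavePipeline': 300}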




Scrapy project





Example for scraping

EX1, encar

encar

from urllib.request import urlopen
from urllib.parse import urlparse

# with urlopen
response_1 = urlopen("http://www.encar.com/")
print('type : {}'.format(type(response_1)))
print("geturl : {}".format(response_1.geturl()))
print("status : {}".format(response_1.status))
print("headers : {}".format(response_1.getheaders()))
print("getcode : {}".format(response_1.getcode()))
print("read : {}".format(response_1.read(1).decode('utf-8')))

# with urlparse
response_2 = urlparse('http://www.encar.co.kr?test=test')
print('total parse : {}'.format(response_2))
print('partial parse : {}'.format(response_2.query))

image




EX2, ipify

ipify

import urllib.request
from urllib.parse import urlparse

# request
API = "https://api.ipify.org"            # some request url
values = {'format': 'json'}              # It is also possible to use text, jsonp instead of json 
params = urllib.parse.urlencode(values)  # get parameter by encoding
url = API + "?" + params                 # request url

# response
data = urllib.request.urlopen(url).read() # read response data
text = data.decode("utf-8")               # decode read data
print('response : {}'.format(text))
SUPPLEMENT
values = {'format': 'json'}              # It is also possible to use text, jsonp instead of json 
params = urllib.parse.urlencode(values)  # get parameter by encoding
print(params)
format=json

image




EX3, mois

mois

import urllib.request
import urllib.parse

API = "http://www.mois.go.kr/gpms/view/jsp/rss/rss.jsp"

params = []
for num in [1001, 1012, 1013, 1014]:
    params.append(dict(ctxCd=num))

for i in params:
    param = urllib.parse.urlencode(i)
    url = API + "?" + param
    res_data = urllib.request.urlopen(url).read()
    contents = res_data.decode("utf-8")
    print(contents)
SUPPLEMENT
for i in params:
    print(i)
    param = urllib.parse.urlencode(i)
    print(param)
{'ctxCd': 1001}
ctxCd=1001
{'ctxCd': 1012}
ctxCd=1012
{'ctxCd': 1013}
ctxCd=1013
{'ctxCd': 1014}
ctxCd=1014

image




EX4, daum finance

daum finance

import json
import urllib.request as req
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent' : ua.ie,
           'referer' : 'https://finance.daum.net/'}
url = "https://finance.daum.net/api/search/ranks?limit=10"

res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
rank_json = json.loads(res)['data']   # str -> json

for elm in rank_json:
    print('rank : {}, trade price : {}, name : {}'.format(elm['rank'], elm['tradePrice'], elm['name']), )
SUPPLEMENT
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.ie)
print(ua.msie)
print(ua.chrome)
print(ua.safari)
print(ua.random)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36
Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36




res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
{"data":[{"rank":1,"rankChange":0,"symbolCode":"A005930","code":"KR7005930003","name":"삼성전자","tradePrice":57100,"change":"FALL","changePrice":2000,"changeRate":0.0338409475,"chartSlideImage":null,"isNew":false},{"rank":2,"rankChange":2,"symbolCode":"A308170","code":"KR7308170000","name":"센트랄모텍","tradePrice":42450,"change":"RISE","changePrice":5750,"changeRate":0.1566757493,"chartSlideImage":null,"isNew":false},{"rank":3,"rankChange":5,"symbolCode":"A068270","code":"KR7068270008","name":"셀트리온","tradePrice":166500,"change":"FALL","changePrice":4500,"changeRate":0.0263157895,"chartSlideImage":null,"isNew":false},{"rank":4,"rankChange":-1,"symbolCode":"A226440","code":"KR7226440006","name":"한송네오텍","tradePrice":1930,"change":"RISE","changePrice":270,"changeRate":0.1626506024,"chartSlideImage":null,"isNew":false},{"rank":5,"rankChange":0,"symbolCode":"A028300","code":"KR7028300002","name":"에이치엘비","tradePrice":96300,"change":"FALL","changePrice":3500,"changeRate":0.0350701403,"chartSlideImage":null,"isNew":false},{"rank":6,"rankChange":-4,"symbolCode":"A215600","code":"KR7215600008","name":"신라젠","tradePrice":13750,"change":"FALL","changePrice":850,"changeRate":0.0582191781,"chartSlideImage":null,"isNew":false},{"rank":7,"rankChange":0,"symbolCode":"A011000","code":"KR7011000007","name":"진원생명과학","tradePrice":5590,"change":"RISE","changePrice":240,"changeRate":0.0448598131,"chartSlideImage":null,"isNew":true},{"rank":8,"rankChange":-1,"symbolCode":"A091990","code":"KR7091990002","name":"셀트리온헬스케어","tradePrice":55600,"change":"FALL","changePrice":500,"changeRate":0.008912656,"chartSlideImage":null,"isNew":false},{"rank":9,"rankChange":0,"symbolCode":"A045060","code":"KR7045060001","name":"오공","tradePrice":7920,"change":"RISE","changePrice":230,"changeRate":0.0299089727,"chartSlideImage":null,"isNew":false},{"rank":10,"rankChange":-4,"symbolCode":"A036540","code":"KR7036540003","name":"SFA반도체","tradePrice":6190,"change":"RISE","changePrice":340,"changeRate":0.0581196581,"chartSlideImage":null,"isNew":false}]}

image




EX5, naver image search

import os
import urllib.parse as rep
import urllib.request as req
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

opener = req.build_opener()                            # Header info initializer
opener.addheaders = [('User-agent', UserAgent().ie)]   # User-Agent info
req.install_opener(opener)                             # Insert header info

base = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query="   # naver image search URL
quote = rep.quote_plus("신소율")                                               # search word
url = base + quote                                                             # combine basic URL + search word

response = req.urlopen(url) # request

# error handling (related to creating the save folder)
save_path = "image"  # image save folder under the current path
try:
    if not os.path.isdir(save_path):           # check whether the folder already exists
        os.makedirs(os.path.join(save_path))   # if not, create it
except OSError as e:
    print("folder creation failed!")
    print("folder name : {}".format(e.filename))
    raise RuntimeError('System Exit!')
else:
    print('folder is created!')


# bs4 initializer
soup = BeautifulSoup(response, "html.parser")
img_list = soup.select("div.img_area > a.thumb._thumb > img")

# download and number the images
for i, img in enumerate(img_list, 1):
    file_name = os.path.join(save_path, save_path + str(i) + '.png'); print('image name : {}'.format(file_name))
    req.urlretrieve(img['data-source'], file_name)

print("download succeeded!")
folder is created!
image name : image\image1.png
image name : image\image2.png
image name : image\image3.png
image name : image\image4.png
image name : image\image5.png
image name : image\image6.png
image name : image\image7.png
image name : image\image8.png
image name : image\image9.png
image name : image\image10.png
image name : image\image11.png
image name : image\image12.png
image name : image\image13.png
image name : image\image14.png
image name : image\image15.png
image name : image\image16.png
image name : image\image17.png
image name : image\image18.png
image name : image\image19.png
image name : image\image20.png
image name : image\image21.png
image name : image\image22.png
image name : image\image23.png
image name : image\image24.png
image name : image\image25.png
image name : image\image26.png
image name : image\image27.png
image name : image\image28.png
image name : image\image29.png
image name : image\image30.png
image name : image\image31.png
image name : image\image32.png
image name : image\image33.png
image name : image\image34.png
image name : image\image35.png
image name : image\image36.png
image name : image\image37.png
image name : image\image38.png
image name : image\image39.png
image name : image\image40.png
image name : image\image41.png
image name : image\image42.png
image name : image\image43.png
image name : image\image44.png
image name : image\image45.png
image name : image\image46.png
image name : image\image47.png
image name : image\image48.png
image name : image\image49.png
image name : image\image50.png
download succeeded!




EX6, danawa log-in

image image image

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# /login/form-data on developer tools
login_info = {
    'redirectUrl': 'http://www.danawa.com/',
    'loginMemberType': 'general',
    'id': '',
    'password': ''
}

# request headers on developer tools
request_headers = {
    'User-Agent': UserAgent().chrome,
    'Referer': 'https://auth.danawa.com/login?url=http%3A%2F%2Fcws.danawa.com%2Fpoint%2Findex.php'
}


with requests.session() as s:
    # Request(try log-in)
    response = s.post('https://auth.danawa.com/login', login_info, headers=request_headers)
    
    # if log-in fail
    if response.status_code != 200:
        raise Exception('Login failed.')

    # move page with session info after log-in
    response = s.get('http://www.danawa.com/member/myPage.php', headers=request_headers)

    # EUC-KR (uncomment if Korean text is not decoded correctly)
    # response.encoding = 'euc-kr'

    # bs4 initializer
    soup = BeautifulSoup(response.text, "html.parser")

    # check whether log-in was successful
    check_name = soup.find('p', class_="p_id")
    
    if check_name is None:
        raise Exception('Login failed. Wrong Password.')
    else:
        print('log-in is successful')
log-in is successful





Save data to DB
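
A minimal sketch of saving scraped rows into a local SQLite database with the standard sqlite3 module (the file, table, and column names are assumptions):

import sqlite3

conn = sqlite3.connect('crawling_result.db')   # creates the file if it does not exist
cur = conn.cursor()

cur.execute('CREATE TABLE IF NOT EXISTS products (name TEXT, price TEXT)')

# e.g. (prod_name, prod_price) pairs collected by the crawler above
rows = [('product_a', '10,000'), ('product_b', '20,000')]
cur.executemany('INSERT INTO products VALUES (?, ?)', rows)

conn.commit()
conn.close()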





h5

import h5py
import numpy as np

f = h5py.File('input_big_data.h5','r')    # load big_data
for i in f.keys():                        
    info = f.get(i)                       # show information about big_data
    print(info)                           
    
    data = np.array(info)                 # show big_data
    print(data)
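
Conversely, a minimal sketch of writing an h5 file (the key name and array are examples):

import h5py
import numpy as np

with h5py.File('output_big_data.h5', 'w') as f:   # create/overwrite an h5 file
    f.create_dataset('train_images', data=np.random.rand(100, 28, 28))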






List of posts followed by this article


Reference

