AI01, Practical data handling
List of posts to read before reading this article
Contents
Datasets
MNIST
https://github.com/myleott/mnist_png
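A minimal sketch for loading these PNG files into NumPy arrays, assuming the archive has been extracted so images sit under mnist_png/training/<label>/*.png:
import glob
import numpy as np
from matplotlib import image
images, labels = [], []
for label in range(10):
    for path in glob.glob('mnist_png/training/{}/*.png'.format(label))[:100]:  # first 100 per class
        images.append(image.imread(path))  # (28, 28) float array in [0, 1]
        labels.append(label)
images = np.array(images); labels = np.array(labels)
print(images.shape, labels.shape)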
File I/O
json
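A minimal load/save sketch with the standard json module (file names are placeholders):
import json
with open('input_data.json', 'r') as f: # load json
    data = json.load(f)
print(data)
with open('output_data.json', 'w') as f: # save json
    json.dump(data, f, ensure_ascii=False, indent=2)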
Image
jpg
$ pip install matplotlib
$ pip install pillow
import matplotlib.pyplot as plt
from matplotlib import image
img = image.imread('input_image.jpg') # load image (Pillow handles JPEG decoding)
plt.imshow(img)
plt.savefig('output_image.jpg') # save image (call before plt.show())
plt.show()
png
$ pip install matplotlib
$ pip install pillow
import matplotlib.pyplot as plt
from matplotlib import image
img = image.imread('input_image.png') # load image
plt.imshow(img)
plt.savefig('output_image.png') # save image (call before plt.show())
plt.show()
Table
import pandas as pd
df = pd.read_csv('input_table.csv') # load table
df.to_excel('output_table.xlsx') # save table (requires an Excel writer such as openpyxl)
Text
with open('input_text.txt','r') as f: # load text
    text = f.read()
with open('output.txt','w') as f: # save text
    f.write(text)
Sound
from gtts import gTTS
import os
text = "Global warming is the long-term rise in the average temperature of the Earth's climate system"
language = "en"
speech = gTTS(text = text, lang = language, slow = False)
speech.save("text.mp3")
os.system("start text.mp3") # play the saved file ('start' is Windows-only)
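The gTTS snippet above only generates and saves speech; for reading an existing sound file, here is a minimal sketch with the standard wave module (it assumes a 16-bit PCM WAV input; the file name is a placeholder):
import wave
import numpy as np
with wave.open('input_sound.wav', 'rb') as w: # load sound (PCM WAV)
    framerate = w.getframerate()
    n_channels = w.getnchannels()
    frames = w.readframes(w.getnframes()) # raw bytes
samples = np.frombuffer(frames, dtype=np.int16) # assumes 16-bit samples
print(framerate, n_channels, samples.shape)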
Load data from WEB
Developer tools
F12 : Elements (Inspector, Ctrl + Shift + C), Network
/robots.txt
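Before scraping, /robots.txt of the target site tells you which paths may be fetched. A minimal check with the standard urllib.robotparser (the URLs are only examples):
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://www.google.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.google.com/search')) # is this path allowed for any user-agent?
print(rp.can_fetch('*', 'https://www.google.com/maps'))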
Scraping : basic
urllib
API
installation
$ pip install urllib3
$ pip install fake-useragent
urlretrieve(from urllib.request)
download file
from urllib.request import urlretrieve
# from : file url
img_url = 'https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png'
html_url = 'https://www.google.com/'
# to : path
img_save_path = r'S:\workspace\2020-01-19\winscp.jpg'
html_save_path = r'S:\workspace\2020-01-19\index.html'
# download file
img_file, img_header = urlretrieve(img_url,img_save_path); print(img_header)
html_file, html_header = urlretrieve(html_url, html_save_path); print(html_header)
OUTPUT
handling error
urlopen(from urllib.request)
response
from urllib.request import urlopen
file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
response = urlopen(file_url)
print('header_info : {}'.format(response.info()))
print('http_status_code : {}'.format(response.getcode()))
print('geturl : {}'.format(response.geturl()))
print('status : {}'.format(response.status))
print('headers : {}'.format(response.getheaders()))
print('contents : {}'.format(response.read(10))) # response binary data, response.content in module 'requests'
print('contents decode: {}'.format(response.read(10).decode('utf-8'))) # decoded data, response.text in module 'requests'; note: read() consumes the stream, so this call returns the next 10 bytes
save file as an object in Python
from urllib.request import urlopen
# from : file url
# to : path
file_url = "https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png"
save_path = r"S:\workspace\2020-01-22\winscp.jpg"
# save file as an object on python
response = urlopen(file_url)
header_info = response.info()
http_status_code = response.getcode()
# download file
contents = response.read()
with open(save_path, 'wb') as c:
    c.write(contents)
OUTPUT
handling error
import urllib.request as req
from urllib.error import URLError, HTTPError
# from : file url
target_url = ["https://user-images.githubusercontent.com/52376448/69004181-481c3d80-0952-11ea-98b4-823969ceb0c3.png",
"https://google.com"]
# to : path
path_list = [r"S:\workspace\2020-01-22\winscp.jpg",
r"S:\workspace\2020-01-22\index.html"]
# download file
for i, url in enumerate(target_url):
    try:
        response = req.urlopen(url)
        contents = response.read()
        print('---------------------------------------------------')
        print('Header Info-{} : {}'.format(i, response.info()))
        print('HTTP Status Code : {}'.format(response.getcode()))
        print('---------------------------------------------------')
        with open(path_list[i], 'wb') as c:
            c.write(contents)
    except HTTPError as e:
        print("Download failed.")
        print('HTTPError Code : ', e.code)
    except URLError as e:
        print("Download failed.")
        print('URL Error Reason : ', e.reason)
    else:
        print()
        print("Download Succeed.")
requests
installation
$ pip install requests
$ pip install lxml
$ pip install cssselect
Request methods : GET
import requests
response = requests.get("https://www.naver.com")
print(response.text) # response data, response.read().decode('utf-8') in module 'urlopen'
print(response.content) # response binary data, response.read() in module 'urlopen'
print(response.headers) # header
print(response.status_code) # status code
print(response.url) # url
print(response.ok) # ok
print(response.encoding) # encoding
SUPPLEMENT, response.text
response.iter_lines() can be used instead of response.text.
import requests
response = requests.get("https://www.naver.com")
#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    print(line)
with session
import requests
session = requests.Session()
response = session.get("https://www.naver.com")
print(response.text)
print(response.content)
print(response.status_code)
print(response.url)
print(response.ok)
print(response.encoding)
session.close()
or
import requests
with requests.Session() as session:
    response = session.get("https://www.naver.com")
    print(response.text)
    print(response.content)
    print(response.status_code)
    print(response.url)
    print(response.ok)
    print(response.encoding)
with cookies, headers
import requests
response1 = requests.get("https://httpbin.org/cookies", cookies={'name':'kim'})
response2 = requests.get("https://httpbin.org", headers={'user-agent':'nice-man_1.0.0_win10_ram16_home_chrome'})
print(response1, response1.text)
print(response2, response2.text)
another way of carrying cookies
import requests
response = requests.get('https://httpbin.org/cookies')
print(response.text)
jar = requests.cookies.RequestsCookieJar()
jar.set('name', 'niceman', domain='httpbin.org', path='/cookies')
response = requests.get('http://httpbin.org/cookies', cookies=jar)
print(response.text)
{
  "cookies": {}
}
{
  "cookies": {
    "name": "niceman"
  }
}
with timeout
import requests
response = requests.get('https://github.com', timeout=10)
print(response.text)
with json
import requests
response = requests.get('https://jsonplaceholder.typicode.com/posts/1')
print('.headers : \n',response.headers)
print('.text : \n',response.text)
print('.json() : \n', response.json())
print('.json().keys() : \n', response.json().keys())
print('.json().values() : \n',response.json().values())
import requests
import json
response = requests.get('http://httpbin.org/stream/100', stream=True)
#if response.encoding is None: response.encoding = 'UTF-8'
for line in response.iter_lines(decode_unicode=True):
    b = json.loads(line); print(b) # type(line) = str, type(b) = dict
    for k, v in b.items():
        print("Key: {}, Values: {}".format(k, v))
with lxml
with cssselect
import requests
import lxml.html
response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)
for i in root.cssselect('.api_list .api_item a.api_link'):
    # i.text_content(), i.get('attr')
    url = i.get('href')
    name = i.cssselect('.api_logo')[0].get('alt')
    print(name, url)
OUTPUT
SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'
with xpath
import requests
import lxml.html
response = requests.get('https://www.naver.com/')
root = lxml.html.fromstring(response.content)
root.make_links_absolute(response.url)
for i in root.xpath('//ul[@class="api_list"]/li[@class="api_item"]/a[@class="api_link"]'):
    url = i.get('href')
    name = i.xpath('./img')[0].get('alt')
    print(name, url)
OUTPUT
SUPPLEMENT
response = requests.get('https://www.naver.com/')
print(response)
print(response.content)
print(response.url)
<Response [200]>
b'<!doctype html>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta name="Referrer" content="origin">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=1100">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<meta name="robots" content="index,nofollow"/>\n<meta name="description" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84 \xeb\xa9\x94\xec\x9d\xb8\xec\x97\x90\xec\x84\x9c \xeb\x8b\xa4\xec\x96\x91\xed\x95\x9c \xec\xa0\x95\xeb\xb3\xb4\xec\x99\x80 \xec\x9c\xa0\xec\x9a\xa9\xed\x95\x9c \xec\xbb\xa8\xed\x85\x90\xec\xb8\xa0\xeb\xa5\xbc \xeb\xa7\x8c\xeb\x82\x98 \xeb\xb3\xb4\xec\x84\xb8\xec\x9a\x94"/>\n<meta property="og:title" content="\xeb\x84\xa4\xec\x9d\xb4\xeb\xb2\x84">\n<meta property="og:url" content="https://www.naver.com/">\n<meta property="og:image" content="https://s.pstatic.net/static/www/
...
...
...
\n\t\t} else if (window.attachEvent) { \n\t\t\twindow.attachEvent("onload", loadJS);\n\t\t} else {\n\t\t\twindow.onload = loadJS;\n\t\t}\n\t\t\n\t</script>\n</body>\n</html>\n'
https://www.naver.com/
Other request methods : POST, PUT (update/replace), DELETE, PATCH (partial update/modify)
import requests
response = requests.post('http://httpbin.org/post', data={'kim':'stellar'})
print(response.text)
print(response.headers)
import requests
payload1 = {'name': 'kim', 'pay': 'true'}
payload2 = (('name', 'park'), ('pay', 'false'))
response1 = requests.post('http://httpbin.org/post', data=payload1)
response2 = requests.post('http://httpbin.org/post', data=payload2)
print(response1.text)
print(response2.text)
import requests
response = requests.put('http://httpbin.org/put', data={'data': '{"name": "Kim", "grade": "A"}'})
print(response.text)
import requests
response = requests.delete('http://httpbin.org/delete')
print(response.text)
import requests
response = requests.delete('https://jsonplaceholder.typicode.com/posts/1')
print(response.text)
BeautifulSoup
installation
$ pip install beautifulsoup4
Basic
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
print(soup.prettify())
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<h1>
this is h1 area
</h1>
<h2>
this is h2 area
</h2>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">
Tillie
</a>
</p>
<p class="story">
story...
</p>
</body>
</html>
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
h1 = soup.html.body.h1; print(h1, h1.string) # h1 tag
p = soup.html.body.p; print(p, p.string) # p tag
<h1>this is h1 area</h1> this is h1 area
<p class="title"><b>The Dormouse's story</b></p> The Dormouse's story
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
p = soup.html.body.p; print('p', p)
p2 = p.next_sibling.next_sibling; print('p2', p2)
p3 = p.next_sibling.next_sibling.next_sibling.next_sibling; print('p3', p3)
p4 = p.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling; print('p4', p4)
p <p class="title"><b>The Dormouse's story</b></p>
p2 <p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
p3 <p class="story">story...</p>
p4 None
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
p = soup.html.body.p
p2 = p.next_sibling.next_sibling
print(list(p2.next_elements))
for i in p2.next_elements:
    print(i)
['Once upon a time there were three little sisters\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 'Lacie', '\n', <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', '\n', '\n', <p class="story">story...</p>, 'story...', '\n', '\n', '\n']
Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
<p class="story">story...</p>
story...
FIND
find_all
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", class_='sister')
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
Elsie Elsie
Lacie Lacie
Tillie Tillie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", string=["Elsie","Tillie"], id="link1")
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Elsie Elsie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find_all("a", limit=2)
print(tag_a)
for i in tag_a:
    print(i.text, i.string)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Elsie Elsie
Lacie Lacie
find
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find("a") # the first tag that was found
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie Elsie
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
tag_a = soup.find("a", {"class": "sister", "data-io": "link3"}) # multiple condition
print(tag_a)
print(tag_a.text, tag_a.string)
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie Tillie
SELECT
select_one
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
select_b = soup.select_one("p.title > b")
select_idlink1 = soup.select_one("a#link1")
select_valuelink3 = soup.select_one("a[data-io='link3']")
print(select_b, select_b.string)
print(select_idlink1, select_idlink1.string)
print(select_valuelink3, select_valuelink3.string)
<b>The Dormouse's story</b> The Dormouse's story
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> Elsie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a> Tillie
select
from bs4 import BeautifulSoup
HTML = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""
soup = BeautifulSoup(HTML, 'html.parser')
select_a = soup.select("p.story > a")
select_a2 = soup.select("p.story > a:nth-of-type(2)")
select_classstory = soup.select("p.story")
print(select_a)
print(select_a[0])
print(select_a[1])
print(select_a[2])
print(select_a[0]['href'])
print(select_a[1]['href'])
print(select_a[2]['href'])
print()
print(select_a2)
print(select_a2[0])
print(select_a2[0]['id'])
print()
print(select_classstory)
print(select_classstory[0])
print(select_classstory[1])
print(select_classstory[0]['class'])
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
link2
[<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>, <p class="story">story...</p>]
<p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
['story']
Selenium
Installation
$ pip install selenium
Access web(with web driver)
web drivers (Chrome, Firefox, PhantomJS) official download links
without option
from selenium import webdriver
browser = webdriver.Chrome('webdriver/chromedriver.exe') # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()
with option(headless)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('webdriver/chromedriver.exe', options=chrome_options) # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.naver.com')
browser.quit()
result without option
web driver method
print(browser.page_source)
print(browser.session_id)
print(browser.title)
print(browser.current_url)
print(browser.get_cookies())
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="ko"><head>
<meta charset="utf-8" />
<meta name="Referrer" content="origin" />
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="viewport" content="width=1100" />
<meta name="apple-mobile-web-app-title" content="NAVER" />
<meta name="robots" content="index,nofollow" />
<meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요" />
...
...
...
if (window.addEventListener) {
window.addEventListener("load", function() { loadJS(); }, true);
} else if (window.attachEvent) {
window.attachEvent("onload", loadJS);
} else {
window.onload = loadJS;
}
</script>
</body></html>
aa015a79b7acddf96892e138b1e75e31
NAVER
https://www.naver.com/
[{'domain': '.naver.com', 'expiry': 1612652898, 'httpOnly': False, 'name': 'NRTK', 'path': '/', 'secure': False, 'value': 'ag#all_gr#1_ma#-2_si#0_en#0_sp#0'}, {'domain': '.naver.com', 'expiry': 2524640401.665735, 'httpOnly': False, 'name': 'NNB', 'path': '/', 'secure': True, 'value': 'EFLEUDPB5U6V4'}, {'domain': 'www.naver.com', 'expiry': 1581203297.710821, 'httpOnly': True, 'name': 'PM_CK_loc', 'path': '/', 'secure': False, 'value': 'd2d101bc3885853d3f553a325db5c09b55091e808f79a0dfdb0fb274ee3cfd30'}]
Search keyword & screen shot
from selenium import webdriver
browser = webdriver.Chrome('webdriver/chromedriver.exe') # Driver path is important
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('https://www.daum.net')
# search keyword
element = browser.find_element_by_css_selector('div.inner_search > input.tf_keyword')
element.send_keys('lion') # search word(input keyword)
element.submit() # form submit
# screen shot
browser.save_screenshot("website_ch1.png") # saving way 1
browser.get_screenshot_as_file("website_ch2.png") # saving way 2
browser.quit()
Click(with Explicitly wait)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
# Explicitly wait
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())
browser.quit()
Click(with Implicitly wait)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
# Implicitly wait
time.sleep(3); browser.find_element_by_xpath('//*[@id="dlMaker_simple"]/dd/div[2]/button[1]').click()
time.sleep(2); browser.find_element_by_xpath('//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label').click()
time.sleep(3)
# bs4 initializer
soup = BeautifulSoup(browser.page_source, "html.parser")
pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
for v in pro_list:
    if not v.find('div', class_='ad_header'):
        # product name, image, price
        print(v.select('p.prod_name > a')[0].text.strip())
        print(v.select('a.thumb_link > img')[0]['src'])
        print(v.select('p.price_sect > a')[0].text.strip())
browser.quit()
Application(click page-number)
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
cur_page_num = 1; target_crawl_num = 5
while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, image, price
            print(v.select('p.prod_name > a')[0].text.strip())
            print(v.select('a.thumb_link > img')[0]['src'])
            print(v.select('p.price_sect > a')[0].text.strip())
    cur_page_num += 1 # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break
    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)
browser.quit()
xlsxwriter
$ pip install xlsxwriter
import urllib.request as req
from io import BytesIO
import xlsxwriter
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
workbook = xlsxwriter.Workbook("crawling_result.xlsx")
worksheet = workbook.add_worksheet()
chrome_options = Options()
chrome_options.add_argument("--headless")
browser = webdriver.Chrome('./webdriver/chromedriver.exe', options=chrome_options)
browser.implicitly_wait(5)
browser.set_window_size(1920, 1280) # maximize_window(), minimize_window()
browser.get('http://prod.danawa.com/list/?cate=112758&15main_11_02')
WebDriverWait(browser, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="dlMaker_simple"]/dd/div[2]/button[1]'))).click()
WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.XPATH, '//*[@id="selectMaker_simple_priceCompare_A"]/li[14]/label'))).click()
time.sleep(3)
cur_page_num = 1; target_crawl_num = 5
ins_cnt = 1 # excel row number
while cur_page_num <= target_crawl_num:
    # bs4 initializer
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pro_list = soup.select('div.main_prodlist.main_prodlist_list > ul > li')
    print('Current Page : {}'.format(cur_page_num))
    for v in pro_list:
        if not v.find('div', class_='ad_header'):
            # product name, price
            prod_name = v.select('p.prod_name > a')[0].text.strip()
            prod_price = v.select('p.price_sect > a')[0].text.strip()
            # save excel(text)
            worksheet.write('A%s' % ins_cnt, prod_name)
            worksheet.write('B%s' % ins_cnt, prod_price)
            """
            # product image
            img_data = BytesIO(req.urlopen(v.select('a.thumb_link > img')[0]['data-original']).read())
            # save excel(image)
            worksheet.insert_image('C%s' % ins_cnt, prod_name, {'image_data': img_data})
            """
            ins_cnt += 1 # next row
    cur_page_num += 1 # next page
    if cur_page_num > target_crawl_num:
        print('Crawling Succeed.')
        break
    WebDriverWait(browser, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.number_wrap > a:nth-child({})'.format(cur_page_num)))).click()
    time.sleep(4)
browser.quit()
workbook.close()
Making excel file
import urllib.request as req
from io import BytesIO
import xlsxwriter
workbook = xlsxwriter.Workbook("excel.xlsx")
worksheet = workbook.add_worksheet()
img_data = BytesIO(req.urlopen('http://img.danawa.com/prod_img/500000/866/250/img/5250866_1.jpg?shrink=160:160&_v=20180713162212').read())
worksheet.write('A1', 'prod_name')
worksheet.write('B1', 'prod_price')
worksheet.insert_image('C1', 'prod_name', {'image_data': img_data})
workbook.close()
Scraping : advanced
installation
pip install scrapy
pip install pypiwin32
Scrapy framework
$ scrapy startproject [project_name]
$ cd [project_name]
$ scrapy genspider [spider_name] [Crawling_URL]
$ scrapy crawl [spider_name]
$ cd [project_name]/spiders
$ scrapy runspider [spider_name].py
parse
response.css
getall()
get()
extract()
extract_first()
response.xpath
getall()
get()
extract()
extract_first()
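A minimal spider sketch showing where parse() and these selector methods fit; the URL and the CSS/XPath expressions are only examples, and get()/getall() are the newer names for extract_first()/extract():
import scrapy
class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://quotes.toscrape.com/']
    def parse(self, response):
        texts = response.css('span.text::text').getall()      # all matches, same as .extract()
        first = response.css('span.text::text').get()          # first match, same as .extract_first()
        authors = response.xpath('//small[@class="author"]/text()').getall()
        for text, author in zip(texts, authors):
            yield {'text': text, 'author': author}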
Shell
$ scrapy shell
$ scrapy shell [crawling_url]
$ scrapy shell [crawling_url] --set="ROBOTSTXT_OBEY=False"
Spider
Selectors
Items
Exports
Settings
Pipeline
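A rough sketch of how these pieces fit together; the project name 'myproject' and the field names are placeholders, not part of any particular project:
# items.py : define the fields a spider yields
import scrapy
class ProductItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
# pipelines.py : post-process every yielded item
class StripPricePipeline:
    def process_item(self, item, spider):
        item['price'] = item['price'].strip()
        return item
# settings.py : register the pipeline (lower number = earlier in the chain)
# ITEM_PIPELINES = {'myproject.pipelines.StripPricePipeline': 300}
# exports : write results to a file from the command line
# $ scrapy crawl example -o result.csv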
Scrapy project
Example for scraping
EX1, encar
from urllib.request import urlopen
from urllib.parse import urlparse
# with urlopen
response_1 = urlopen("http://www.encar.com/")
print('type : {}'.format(type(response_1)))
print("geturl : {}".format(response_1.geturl()))
print("status : {}".format(response_1.status))
print("headers : {}".format(response_1.getheaders()))
print("getcode : {}".format(response_1.getcode()))
print("read : {}".format(response_1.read(1).decode('utf-8')))
# with urlparse
response_2 = urlparse('http://www.encar.co.kr?test=test')
print('total parse : {}'.format(response_2))
print('partial parse : {}'.format(response_2.query))
EX2, ipify
import urllib.request
from urllib.parse import urlparse
# request
API = "https://api.ipify.org" # some request url
values = {'format': 'json'} # 'text' or 'jsonp' can also be used instead of 'json'
params = urllib.parse.urlencode(values) # get parameter by encoding
url = API + "?" + params # request url
# response
data = urllib.request.urlopen(url).read() # read response data
text = data.decode("utf-8") # decode read data
print('response : {}'.format(text))
SUPPLEMENT
values = {'format': 'json'} # 'text' or 'jsonp' can also be used instead of 'json'
params = urllib.parse.urlencode(values) # get parameter by encoding
print(params)
format=json
EX3, mois
import urllib.request
import urllib.parse
API = "http://www.mois.go.kr/gpms/view/jsp/rss/rss.jsp"
params = []
for num in [1001, 1012, 1013, 1014]:
    params.append(dict(ctxCd=num))
for i in params:
    param = urllib.parse.urlencode(i)
    url = API + "?" + param
    res_data = urllib.request.urlopen(url).read()
    contents = res_data.decode("utf-8")
    print(contents)
SUPPLEMENT
for i in params:
    print(i)
    param = urllib.parse.urlencode(i)
    print(param)
{'ctxCd': 1001}
ctxCd=1001
{'ctxCd': 1012}
ctxCd=1012
{'ctxCd': 1013}
ctxCd=1013
{'ctxCd': 1014}
ctxCd=1014
EX4, daum finance
import json
import urllib.request as req
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent' : ua.ie,
'referer' : 'https://finance.daum.net/'}
url = "https://finance.daum.net/api/search/ranks?limit=10"
res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
rank_json = json.loads(res)['data'] # str -> json
for elm in rank_json:
    print('rank : {}, trade price : {}, name : {}'.format(elm['rank'], elm['tradePrice'], elm['name']))
SUPPLEMENT
from fake_useragent import UserAgent
ua = UserAgent()
print(ua.ie)
print(ua.msie)
print(ua.chrome)
print(ua.safari)
print(ua.random)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)
Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36
Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36
res = req.urlopen(req.Request(url, headers=headers)).read().decode('utf-8')
{"data":[{"rank":1,"rankChange":0,"symbolCode":"A005930","code":"KR7005930003","name":"삼성전자","tradePrice":57100,"change":"FALL","changePrice":2000,"changeRate":0.0338409475,"chartSlideImage":null,"isNew":false},{"rank":2,"rankChange":2,"symbolCode":"A308170","code":"KR7308170000","name":"센트랄모텍","tradePrice":42450,"change":"RISE","changePrice":5750,"changeRate":0.1566757493,"chartSlideImage":null,"isNew":false},{"rank":3,"rankChange":5,"symbolCode":"A068270","code":"KR7068270008","name":"셀트리온","tradePrice":166500,"change":"FALL","changePrice":4500,"changeRate":0.0263157895,"chartSlideImage":null,"isNew":false},{"rank":4,"rankChange":-1,"symbolCode":"A226440","code":"KR7226440006","name":"한송네오텍","tradePrice":1930,"change":"RISE","changePrice":270,"changeRate":0.1626506024,"chartSlideImage":null,"isNew":false},{"rank":5,"rankChange":0,"symbolCode":"A028300","code":"KR7028300002","name":"에이치엘비","tradePrice":96300,"change":"FALL","changePrice":3500,"changeRate":0.0350701403,"chartSlideImage":null,"isNew":false},{"rank":6,"rankChange":-4,"symbolCode":"A215600","code":"KR7215600008","name":"신라젠","tradePrice":13750,"change":"FALL","changePrice":850,"changeRate":0.0582191781,"chartSlideImage":null,"isNew":false},{"rank":7,"rankChange":0,"symbolCode":"A011000","code":"KR7011000007","name":"진원생명과학","tradePrice":5590,"change":"RISE","changePrice":240,"changeRate":0.0448598131,"chartSlideImage":null,"isNew":true},{"rank":8,"rankChange":-1,"symbolCode":"A091990","code":"KR7091990002","name":"셀트리온헬스케어","tradePrice":55600,"change":"FALL","changePrice":500,"changeRate":0.008912656,"chartSlideImage":null,"isNew":false},{"rank":9,"rankChange":0,"symbolCode":"A045060","code":"KR7045060001","name":"오공","tradePrice":7920,"change":"RISE","changePrice":230,"changeRate":0.0299089727,"chartSlideImage":null,"isNew":false},{"rank":10,"rankChange":-4,"symbolCode":"A036540","code":"KR7036540003","name":"SFA반도체","tradePrice":6190,"change":"RISE","changePrice":340,"changeRate":0.0581196581,"chartSlideImage":null,"isNew":false}]}
EX5, naver image search
import os
import urllib.parse as rep
import urllib.request as req
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
opener = req.build_opener() # Header info initializer
opener.addheaders = [('User-agent', UserAgent().ie)] # User-Agent info
req.install_opener(opener) # Insert header info
base = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query=" # naver image search URL
quote = rep.quote_plus("신소율") # search word
url = base + quote # combine basic URL + search word
response = req.urlopen(url) # request
# error handling (related to creating the folder)
save_path = "image" # image save folder at current path
try:
    if not (os.path.isdir(save_path)): # check whether the folder already exists
        os.makedirs(os.path.join(save_path)) # if not, create it
except OSError as e:
    print("folder creation failed!")
    print("folder name : {}".format(e.filename))
    raise RuntimeError('System Exit!')
else:
    print('folder is created!')
# bs4 initializer
soup = BeautifulSoup(response, "html.parser")
img_list = soup.select("div.img_area > a.thumb._thumb > img")
# download, numbering images
for i, img_list in enumerate(img_list, 1):
    file_name = os.path.join(save_path, save_path + str(i) + '.png'); print('image name : {}'.format(file_name))
    req.urlretrieve(img_list['data-source'], file_name)
print("download succeeded!")
folder is created!
image name : image\image1.png
image name : image\image2.png
image name : image\image3.png
image name : image\image4.png
image name : image\image5.png
image name : image\image6.png
image name : image\image7.png
image name : image\image8.png
image name : image\image9.png
image name : image\image10.png
image name : image\image11.png
image name : image\image12.png
image name : image\image13.png
image name : image\image14.png
image name : image\image15.png
image name : image\image16.png
image name : image\image17.png
image name : image\image18.png
image name : image\image19.png
image name : image\image20.png
image name : image\image21.png
image name : image\image22.png
image name : image\image23.png
image name : image\image24.png
image name : image\image25.png
image name : image\image26.png
image name : image\image27.png
image name : image\image28.png
image name : image\image29.png
image name : image\image30.png
image name : image\image31.png
image name : image\image32.png
image name : image\image33.png
image name : image\image34.png
image name : image\image35.png
image name : image\image36.png
image name : image\image37.png
image name : image\image38.png
image name : image\image39.png
image name : image\image40.png
image name : image\image41.png
image name : image\image42.png
image name : image\image43.png
image name : image\image44.png
image name : image\image45.png
image name : image\image46.png
image name : image\image47.png
image name : image\image48.png
image name : image\image49.png
image name : image\image50.png
download succeeded!
EX6, danawa log-in
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# /login/form-data on developer tools
login_info = {
'redirectUrl': 'http://www.danawa.com/',
'loginMemberType': 'general',
'id': '',
'password': ''
}
# request headers on developer tools
request_headers = {
'User-Agent': UserAgent().chrome,
'Referer': 'https://auth.danawa.com/login?url=http%3A%2F%2Fcws.danawa.com%2Fpoint%2Findex.php'
}
with requests.session() as s:
    # Request(try log-in)
    response = s.post('https://auth.danawa.com/login', login_info, headers=request_headers)
    # if log-in fails
    if response.status_code != 200:
        raise Exception('Login failed.')
    # move to a page that requires the session info after log-in
    response = s.get('http://www.danawa.com/member/myPage.php', headers=request_headers)
    # EUC-KR (if Korean text is garbled)
    # response.encoding = 'euc-kr'
    # bs4 initializer
    soup = BeautifulSoup(response.text, "html.parser")
    # check whether log-in was successful
    check_name = soup.find('p', class_="p_id")
    if check_name is None:
        raise Exception('Login failed. Wrong Password.')
    else:
        print('log-in is successful')
log-in is successful
Save data to DB
h5
import h5py
import numpy as np
f = h5py.File('input_big_data.h5','r') # load big_data
for i in f.keys():
    info = f.get(i) # show information about big_data
    print(info)
    data = np.array(info) # show big_data
    print(data)
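The snippet above only reads an existing file; a minimal sketch for writing one with h5py (the dataset name and array shape are arbitrary):
import h5py
import numpy as np
data = np.random.rand(1000, 10) # example array
with h5py.File('output_big_data.h5', 'w') as f: # save big_data
    f.create_dataset('dataset_1', data=data, compression='gzip')
    f['dataset_1'].attrs['description'] = 'example dataset'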
List of posts followed by this article