셀레니움을 통해 html에 있는 데이터 정보를 가져올 수 있는 것을 확인했지만, 작업 효율(메모리 , 시간)을 위해 두 기술을 섞어서 사용하자
bs4 & 셀레니움 두 가지 기능을 통해 가져오자
from selenium import webdriver
from bs4 import BeautifulSoup as BeautifulSoup
url = 'https://www.melon.com/chart/index.htm'
driver = webdriver.Chrome()
driver.get(url)
html = driver.page_source
bs4 = BeautifulSoup(html, 'html.parser')
song_parts = bs4.select('tbody > tr')
for song in (song_parts):
singer = song.find('span', class_= 'checkEllipsis').get_text().replace('\n', '')
title = song.find('div', class_= 'ellipsis rank01').get_text().replace('\n', '')
album = song.find('div', class_= 'ellipsis rank03').get_text().replace('\n', '').replace("'", "\'")
like = song.find('button', class_ = 'button_etc like').get_text().replace('\n', '').replace('좋아요총건수', '').replace(',', '')
print(singer, title, album, like)
print()
driver.quit()
성공
#vscode에서 DB 테이블에 데이터 입력 준비
import pymysql
conn = pymysql.connect(
host='localhost',
user="root",
database= "mydatabase",
password= "**********",
charset='utf8')
sql = "INSERT INTO melon_top_100 (title, singer, album, likes) VALUES (%s, %s, %s, %s)"
최종 코드
import pymysql
from selenium import webdriver
from bs4 import BeautifulSoup as BeautifulSoup
conn = pymysql.connect(
host='localhost',
user="root",
database= "mydatabase",
password= "**********",
charset='utf8')
url = 'https://www.melon.com/chart/index.htm'
driver = webdriver.Chrome()
driver.get(url)
html = driver.page_source
bs4 = BeautifulSoup(html, 'html.parser')
song_parts = bs4.select('tbody > tr')
sql = "INSERT INTO melon_top_100 (title, singer, album, likes) VALUES (%s, %s, %s, %s)"
with conn:
with conn.cursor() as cur:
for song in (song_parts):
singer = song.find('span', class_= 'checkEllipsis').get_text().replace('\n', '')
title = song.find('div', class_= 'ellipsis rank01').get_text().replace('\n', '')
album = song.find('div', class_= 'ellipsis rank03').get_text().replace('\n', '').replace("'", "\'")
like = song.find('button', class_ = 'button_etc like').get_text().replace('\n', '').replace('좋아요총건수', '').replace(',', '')
cur.execute(sql, (title, singer, album, like))
conn.commit()
driver.quit()
결과
조회도 잘 된다
SELECT id, singer, likes FROM mydatabase.melon_top_100 ORDER BY likes DESC LIMIT 20;
'ETL' 카테고리의 다른 글
#1. [Mini Project] 멜론 TOP100 + DB (ETL _ bs4 & Selenium) (0) | 2023.01.16 |
---|
댓글