Python Selenium, bs4를 활용한 웹크롤링

 

chromedriver 설치

chrome 버전 확인
[설정] - [Chrome 정보]에서 확인 가능합니다



chrome 버전이 업그레이드될 때마다 해당 버전에 맞는 chromedriver를 다시 설치해 주어야 합니다
최신 chromedriver 버전 : https://googlechromelabs.github.io/chrome-for-testing/#stable

 

  • python - getChromeDriver.py
import os

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Create the shared Chrome WebDriver session, installing a matching
# chromedriver first when none is found at `driver_path`.
# Other scripts import this module and reuse the module-level `driver`.

# Major version of the local Chrome install; used for the log message below
# (and for the example path in the next comment).
chrome_ver = chromedriver_autoinstaller.get_chrome_version().split('.')[0]

# Path of an already-downloaded chromedriver binary (placeholder: edit this),
# e.g. driver_path = f'./{chrome_ver}/chromedriver.exe'
driver_path = 'chromedriver 파일 경로'

if os.path.exists(driver_path):
    print(f"chrom driver is insatlled: {driver_path}")
    resolved_driver_path = driver_path
else:
    print(f"install the chrome driver(ver: {chrome_ver})")
    # install(True) downloads into the current working directory; keep the
    # returned path instead of discarding it (may be None if the download
    # failed - NOTE(review): confirm against the installed library version).
    resolved_driver_path = chromedriver_autoinstaller.install(True)

# Start the browser. Hand Selenium the exact driver binary when we have one,
# so an existing `driver_path` is actually used; otherwise fall back to
# Selenium's default driver lookup, as the original code did.
if resolved_driver_path and os.path.isfile(resolved_driver_path):
    driver = webdriver.Chrome(service=Service(executable_path=resolved_driver_path))
else:
    driver = webdriver.Chrome()

 

  • python - 웹크롤링 실행파일
import sys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions as ex
import pyautogui
from bs4 import BeautifulSoup as bs
import getChromeDriver

# Start crawling: reuse the Chrome session created by the getChromeDriver module.
driver = getChromeDriver.driver
url = 'url 입력'
# Login credentials (placeholders, like `url` above - replace before running).
# The original passed the builtin `id` function and an undefined name `pw`.
login_id = 'id 입력'
login_pw = 'pw 입력'

driver.get(url)
time.sleep(3)  # fixed pause after opening the page

# Selenium 4 style: find_element_by_*() -> find_element(By.<strategy>, value)
# Log in. send_keys() and click() return None, so their results are not stored.
driver.find_element(By.XPATH, "//input[@name='os_username']").send_keys(login_id)  # HTML tag
driver.find_element(By.XPATH, "//input[@name='os_password']").send_keys(login_pw)  # HTML tag
driver.find_element(By.NAME, 'login').click()


# Parse the current page with BeautifulSoup (bs4).
page_source = driver.page_source
soup = bs(page_source, "html.parser")
# `html` ends up holding the matched elements, as in the original script.
html = soup.select('div.user-content-block')  # HTML tag

# Text content of every matched block, in document order.
list1 = [block.get_text() for block in html]