Example Python code for web crawling with Selenium and BeautifulSoup

The script below opens a Google News Korea topic page in headless Chrome, collects the headline links with BeautifulSoup, then visits each article and scrapes the writer, date, and body text using publisher-specific XPaths (조선일보 and MBC뉴스 are handled).


from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def get_news_content(driver, publisher):
    """Scrape writer, publication date, and body text from an article page.

    The XPaths are specific to each publisher's page layout, so only the
    publishers handled below are supported.
    """
    writer, date, content = '', None, ''
    if publisher == '조선일보':
        writer = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a').text.split()[0]

        # The first three characters are a label prefix; the timestamp follows.
        date_str = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span').text[3:]
        date = datetime.strptime(date_str, '%Y.%m.%d %H:%M')  # e.g. 2021.10.16 14:48

        # The body is a sequence of <p> elements; read p[1], p[2], ... until
        # the next index no longer exists.
        i = 1
        while True:
            try:
                content += driver.find_element(By.XPATH, '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p[{0}]'.format(i)).text
                i += 1
            except NoSuchElementException:
                break
    elif publisher == 'MBC뉴스':
        writer = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[1]/div/span[2]/a').text

        date_str = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[3]/div[1]/span[1]').text[3:]
        date = datetime.strptime(date_str, '%Y-%m-%d %H:%M')  # e.g. 2021-10-16 14:48

        content = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[2]/div[5]').text
        # Trim markup fragments that can appear in the extracted text.
        content = content.split('<b style')[0].replace('<br>', '')

    return writer, date, content
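
# A sketch, not part of the original code: find_elements (plural) returns an
# empty list instead of raising NoSuchElementException, so the counter loop
# above could also be written without try/except.
def get_paragraphs(driver, xpath):
    """Hypothetical helper: join the text of every element matching xpath."""
    return ''.join(p.text for p in driver.find_elements(By.XPATH, xpath))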


def get_news_heads(driver, url):
    """Open a Google News topic page and scrape each linked article."""
    driver.implicitly_wait(3)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # Each headline lives in an <article>; depending on the layout it sits one
    # or two <div> levels below the column div.
    # (Equivalent direct selector: 'main > div > div > div > div > article'.)
    divs = soup.select('main > div > div > div')
    for div in divs:
        article = div.select_one('div > article')
        if article is None:
            article = div.select_one('article')
        if article is None:
            continue

        title_a = article.select_one('h3 > a')
        title = title_a.text
        # The link is site-relative, so prepend the Google News origin.
        news_url = 'https://news.google.com' + str(title_a.get('href'))

        publisher_a = article.select_one('div > div > a')
        publisher = publisher_a.get_text()

        print('---------------------------------')
        print(publisher, title, news_url)

        driver.implicitly_wait(3)
        driver.get(news_url)
        writer, date, content = get_news_content(driver, publisher)

        print('================================================')
        print(publisher, ', writer :', writer)
        print(' date :', date)
        print(' content :', content)
        print('================================================')
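
# A sketch, not part of the original code: Google News renders its headline
# list with JavaScript, so an explicit wait on a concrete element is usually
# more reliable than the fixed implicitly_wait(3) used above.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_articles(driver, timeout=10):
    """Hypothetical helper: block until at least one <article> is present."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article')))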



# Google News Korea topic page to crawl.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'

options = Options()
options.add_argument('--headless')
options.add_argument('window-size=1400,1500')

driver = webdriver.Chrome(options=options)

try:
    get_news_heads(driver, url)
finally:
    # Always shut the browser down, even if scraping fails.
    driver.quit()
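
# To run this example (assuming Python 3, a local Chrome install, and
# Selenium 4.6+, which downloads a matching chromedriver automatically):
#   pip install selenium beautifulsoup4 html5lib
#   python crawl_google_news.py   # hypothetical file name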
