Example Python code for web crawling with Selenium and BeautifulSoup

The script below opens a Google News Korea topic page in headless Chrome, collects the headline links with BeautifulSoup, then visits each article and scrapes the writer, date, and body text using publisher-specific XPaths (조선일보 and MBC뉴스 are handled).


from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def get_news_content(driver, publisher):
    """Scrape writer, publication date, and body text from an article page.

    The XPaths are specific to each publisher's page layout, so only the
    publishers handled below are supported.
    """
    writer, date, content = '', None, ''
    if publisher == '조선일보':
        writer = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a').text.split()[0]

        # The first three characters are a label prefix; the timestamp follows.
        date_str = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span').text[3:]
        date = datetime.strptime(date_str, '%Y.%m.%d %H:%M')  # e.g. 2021.10.16 14:48

        # The body is a sequence of <p> elements; read p[1], p[2], ... until
        # the next index no longer exists.
        i = 1
        while True:
            try:
                content += driver.find_element(By.XPATH, '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p[{0}]'.format(i)).text
                i += 1
            except NoSuchElementException:
                break
    elif publisher == 'MBC뉴스':
        writer = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[1]/div/span[2]/a').text

        date_str = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[3]/div[1]/span[1]').text[3:]
        date = datetime.strptime(date_str, '%Y-%m-%d %H:%M')  # e.g. 2021-10-16 14:48

        content = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[2]/div[5]').text
        # Trim markup fragments that can appear in the extracted text.
        content = content.split('<b style')[0].replace('<br>', '')

    return writer, date, content
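
# A sketch, not part of the original code: find_elements (plural) returns an
# empty list instead of raising NoSuchElementException, so the counter loop
# above could also be written without try/except.
def get_paragraphs(driver, xpath):
    """Hypothetical helper: join the text of every element matching xpath."""
    return ''.join(p.text for p in driver.find_elements(By.XPATH, xpath))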


def get_news_heads(driver, url):
    """Open a Google News topic page and scrape each linked article."""
    driver.implicitly_wait(3)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # Each headline lives in an <article>; depending on the layout it sits one
    # or two <div> levels below the column div.
    # (Equivalent direct selector: 'main > div > div > div > div > article'.)
    divs = soup.select('main > div > div > div')
    for div in divs:
        article = div.select_one('div > article')
        if article is None:
            article = div.select_one('article')
        if article is None:
            continue

        title_a = article.select_one('h3 > a')
        title = title_a.text
        # The link is site-relative, so prepend the Google News origin.
        news_url = 'https://news.google.com' + str(title_a.get('href'))

        publisher_a = article.select_one('div > div > a')
        publisher = publisher_a.get_text()

        print('---------------------------------')
        print(publisher, title, news_url)

        driver.implicitly_wait(3)
        driver.get(news_url)
        writer, date, content = get_news_content(driver, publisher)

        print('================================================')
        print(publisher, ', writer :', writer)
        print(' date :', date)
        print(' content :', content)
        print('================================================')
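
# A sketch, not part of the original code: Google News renders its headline
# list with JavaScript, so an explicit wait on a concrete element is usually
# more reliable than the fixed implicitly_wait(3) used above.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_articles(driver, timeout=10):
    """Hypothetical helper: block until at least one <article> is present."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article')))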



# Google News Korea topic page to crawl.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'

options = Options()
options.add_argument('--headless')
options.add_argument('window-size=1400,1500')

driver = webdriver.Chrome(options=options)

try:
    get_news_heads(driver, url)
finally:
    # Always shut the browser down, even if scraping fails.
    driver.quit()
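
# To run this example (assuming Python 3, a local Chrome install, and
# Selenium 4.6+, which downloads a matching chromedriver automatically):
#   pip install selenium beautifulsoup4 html5lib
#   python crawl_google_news.py   # hypothetical file name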
