$ cat /proc/version
Linux version 4.14.171-136.231.amzn2.x86_64 (mockbuild@ip-10-0-1-138) (gcc version 7.3.1 20180712 (Red Hat 7.3.1-6) (GCC)) #1 SMP Thu Feb 27 20:22:48 UTC 2020
$ sudo vi /etc/yum.repos.d/google-chrome.repo
[google-chrome]
name=google-chrome
baseurl=https://dl.google.com/linux/chrome/rpm/stable/x86_64
enabled=1
gpgcheck=1
gpgkey=https://dl.google.com/linux/linux_signing_key.pub
$ sudo yum install google-chrome-stable
$ google-chrome --version
Google Chrome 97.0.4692.71
$ cd /tmp/
$ sudo wget https://chromedriver.storage.googleapis.com/97.0.4692.71/chromedriver_linux64.zip // <= version must match
$ sudo unzip chromedriver_linux64.zip
$ sudo mv chromedriver /usr/bin/chromedriver
$ chromedriver --version
$ pip3 install selenium
$ vi test.py
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

# Smoke test: load the Naver front page headlessly and print the text of the
# first element with class "blind" (screen-reader text, e.g. the site title).
url = 'https://naver.com/'

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,1500")

driver = webdriver.Chrome(options=options)
try:
    driver.get(url)
    # find_element('class name', ...) works on Selenium 3 and 4 alike;
    # the find_element_by_* helpers were removed in Selenium 4.3.
    text = driver.find_element('class name', 'blind').text
    print(text)
finally:
    # Always shut the browser down, even if navigation/lookup fails.
    driver.quit()
$ python3 test.py
NAVER whale
$ pip3 install beautifulsoup4 // <= used for parsing ("bs4" on PyPI is only a dummy/shim package)
$ pip3 install requests
$ pip3 install lxml
$ pip3 install html5lib
$ vi getnews.py
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
def get_news_info(driver, news_url, publisher):
    """Scrape writer, body text and publication date from one article page.

    Only 조선일보 (Chosun Ilbo) article pages are parsed; for any other
    publisher the defaults are returned unchanged.

    Args:
        driver: live selenium WebDriver, shared across calls. This function
            must NOT quit it — the caller owns its lifecycle.
        news_url: absolute URL of the article page.
        publisher: publisher name as it appeared on the listing page.

    Returns:
        (writer, article, date) — writer/article are '' and date is None
        when the publisher is not handled.
    """
    date, article, writer = None, '', ''
    if publisher == '조선일보':
        driver.implicitly_wait(3)
        driver.get(news_url)
        # find_element('xpath', ...) is portable across Selenium 3 and 4
        # (the find_element_by_xpath helper was removed in Selenium 4.3).
        writer = driver.find_element(
            'xpath',
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a'
        ).text.split()[0]
        # Date text looks like "입력 2021.10.16 14:48"; drop the 3-char prefix.
        date_time_obj = driver.find_element(
            'xpath',
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span'
        ).text[3:]
        date = datetime.strptime(date_time_obj, '%Y.%m.%d %H:%M')
        # Concatenate body paragraphs p[1], p[2], ... until a lookup fails.
        i = 1
        while True:
            try:
                article += driver.find_element(
                    'xpath',
                    '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p[{0}]'.format(i)
                ).text
                i += 1
            except Exception:  # NoSuchElementException: past the last paragraph
                break
        # BUG FIX: the original called driver.quit() here, which destroyed the
        # shared driver after the first article and broke every later call
        # made by get_news_heads(); the caller quits it in its finally block.
    # NOTE(review): original paste lost indentation — these prints are taken
    # as function-level (run for every publisher), matching the per-item
    # separator output of the caller.
    print('================================================')
    print(publisher, ', writer : ', writer)
    print(' date :', date)
    print(' article :', article)
    print('================================================')
    return writer, article, date
def get_news_heads(driver, page_url=None):
    """Walk a Google News topic page and scrape each linked article.

    The listing page is fetched with requests/BeautifulSoup; each article is
    then visited with the shared selenium driver via get_news_info().

    Args:
        driver: shared selenium WebDriver, reused for every article.
        page_url: topic page to scan; defaults to the module-level `url`
            for backward compatibility with existing callers.
    """
    resp = requests.get(url if page_url is None else page_url)
    soup = BeautifulSoup(resp.text, 'lxml')
    items = soup.select('div > article > h3 > a')
    for idx, item in enumerate(items):
        # Must use nth-of-type (not nth-child) for positional matching here.
        publisherA = soup.select(
            'main > div.lBwEZb.BL5WZb.GndZbb > div:nth-of-type({0}) > div > div > article > div > div > a'.format(idx + 1)
        )
        if len(publisherA) == 0:
            continue
        publisher = publisherA[0].getText()
        title = item.text
        # hrefs on the listing are relative ("./articles/...").
        news_url = 'https://news.google.com' + str(item.get('href'))
        print('---------------------------------')
        print('IDX : ', idx, publisher, title, news_url)
        get_news_info(driver, news_url, publisher)
# Google News topic page (Korean locale) used as the scraping entry point.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'

# Headless Chrome with a fixed window size so the page renders consistently.
chrome_opts = Options()
for arg in ("--headless", "window-size=1400,1500"):
    chrome_opts.add_argument(arg)

driver = webdriver.Chrome(options=chrome_opts)
try:
    get_news_heads(driver)
finally:
    # Release the browser no matter how scraping ends.
    driver.quit()