네이버뉴스제목

requests #HTML 문서의 전체를 가져온다.

BeautifulSoup #HTML문서를 탐색해서 원하는 부분만 가져온다.

(vscode) beautifulsoup, 네이버뉴스검색 제목

Google Colaboratory

바탕화면 폴더 만들기(visual)

test.py 만들기

print("안녕")으로 파이썬 작동 확인

검색어 입력하여 블로그 열기

# (터미널에)"pip install bs4"로 beautifulsoup 라이브러리 설치

# (터미널) "pip install requests"

from bs4 import BeautifulSoup

from urllib.parse import quote

import urllib.request

# Naver blog-search endpoint; the percent-encoded query is appended to it.
site = 'https://search.naver.com/search.naver?nso=&where=blog&sm=tab_opt&query='

# Read the search term, then percent-encode it so it can be embedded in a URL.
search_term = input('검색할 단어:')
keyword = quote(search_term)

# Build the final search URL and show it.
url = f'{site}{keyword}'
print(url)

# (터미널에)"pip install bs4"로 beautifulsoup 라이브러리 설치

# (터미널) "pip install requests"

from bs4 import BeautifulSoup #beautifulsoup을 불러옴 그럼 아래에서는 포함된 기능을 사용할 수 있음

from urllib.request import urlopen #urllib.request라는 곳에서 urlopen 기능을 가져와 아래에서 사용하겠다.

# Open the page and bind the HTTP response to `response`; the `with` block
# closes the connection when it ends.
# (Equivalent to `response = urlopen('https://en.wikipedia.org/wiki/Main_Page')`
# without the automatic close.)
with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:
    # Parse the response with the built-in 'html.parser' and keep the tree in `soup`.
    soup = BeautifulSoup(response, 'html.parser')

    # Visit every <a> tag in the document.
    for anchor in soup.find_all('a'):
        # Print the link target; '/' is printed when the tag has no href attribute.
        print(anchor.get('href', '/'))

위키에 있는 beautifulsoup 예제로 실행 확인

네이버에서 "중학교 기술가정" 뉴스 검색 후

URL 획득

beautifulsoup 한글문서

select, 클래스로 찾기

(구조분석) 제목들은 a태그 안에 class="news_tit"

from bs4 import BeautifulSoup

from urllib.request import urlopen

# Fetch the Naver news search results page for the pre-encoded query in the URL.
with urlopen('https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query=%EC%A4%91%ED%95%99%EA%B5%90+%EA%B8%B0%EC%88%A0%EA%B0%80%EC%A0%95&oquery=%EA%B8%B0%EC%88%A0%EA%B0%80%EC%A0%95&tqi=hkyapwp0JXossFNLhSlssssst1w-261819') as response:
    soup = BeautifulSoup(response, 'html.parser')

    # CSS selector "a.news_tit" matches every <a> tag that has class "news_tit".
    for anchor in soup.select("a.news_tit"):
        # Print the tag's title attribute ('/' when it is missing).
        # anchor.get_text() is an alternative way to extract the headline.
        print(anchor.get('title', '/'))

구글에서 "파이썬 파일쓰기" 검색

# writedata.py
# Write ten numbered lines to a new text file.
# The `with` block closes the file even if a write raises, replacing the
# manual open()/close() pair.
with open("C:/doit/새파일.txt", 'w') as f:
    for i in range(1, 11):
        data = f"{i}번째 줄입니다.\n"
        f.write(data)

코드 복사해 사용하기

https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EA%B8%B0%EC%88%A0%EA%B0%80%EC%A0%95&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=35&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start=1

https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EA%B8%B0%EC%88%A0%EA%B0%80%EC%A0%95&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=16&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start=11

https://search.naver.com/search.naver?where=news&sm=tab_pge&query=%EA%B8%B0%EC%88%A0%EA%B0%80%EC%A0%95&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=51&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start=21

URL 디코더

from bs4 import BeautifulSoup

from urllib.request import urlopen

import urllib.parse

# Setup for the news crawler: read the search term and page count, then open
# the output file used by the loop that follows.

# Search term typed by the user (the prompt asks which news to search for).
index = input('어떤뉴스를 검색할까요?: ')

text = urllib.parse.quote_plus(index) # convert the term into characters that are safe inside a URL

# Value substituted into the search URL's `start=` parameter; begins at the first result.
page = 1

# NOTE(review): presumably a running counter for the loop below; its use is not visible here.
count = 1

# Number of result pages to crawl, as typed by the user (still a string here).
i = input('몇페이지까지 크롤링 할까요?: ')

# int(i) raises ValueError if the input is not a whole number.
lastpage = int(i)*10 - 9 # news page 1 is page = 1, page 2 is page = 11, page 3 is page = 21, ...

# Output file named "<search term>결과.txt", opened for writing.
# NOTE(review): no encoding is given, so the platform default applies; the matching close() is not visible here.
f = open(index + "결과.txt", 'w')

while page < lastpage + 1:

url = f'https://search.naver.com/search.naver?where=news&sm=tab_pge&query={text}&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=12&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start={page}'