更新
This commit is contained in:
31
Python-100-Days/Day66-75/code/example04.py
Normal file
31
Python-100-Days/Day66-75/code/example04.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import re
|
||||
import requests
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def main():
    """Count the distinct question links on Zhihu's explore page.

    Requests ``https://www.zhihu.com/explore`` with a spider-style
    user-agent, collects every ``<a>`` tag whose ``href`` starts with
    ``/question``, resolves each one against the site root, and prints how
    many distinct URLs were found.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {'user-agent': 'Baiduspider'}
    # NOTE(review): only an 'http' proxy is configured while the seed URL is
    # https, so this proxy is presumably not used for the request below --
    # confirm whether an 'https' entry is intended.
    proxies = {
        'http': 'http://122.114.31.177:808',
    }
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # requests waits indefinitely unless a timeout is given; bound it so a
    # dead proxy or stalled server cannot hang the script.
    resp = requests.get(seed_url,
                        headers=headers,
                        proxies=proxies,
                        timeout=10)
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # find_all() only yields tags whose href matches the pattern, so every
    # tag is guaranteed to carry an 'href' attribute; the set removes
    # duplicate links.
    link_set = {
        urljoin(base_url, a_tag.attrs['href'])
        for a_tag in soup.find_all('a', {'href': href_regex})
    }
    print('Total %d question pages found.' % len(link_set))
|
||||
|
||||
|
||||
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user