# Fetch person details and relationship-graph information from Baidu Baike with Python.
import pandas as pd
import time
from selenium import webdriver
from urllib import parse
# Single Firefox WebDriver session created when the module is executed; the
# scraping functions in this file all drive this one browser instance.
driver = webdriver.Firefox()
# Fetch the detail page of a single person.
def get_one_detail(url):
    """Scrape the name, summary and portrait link of one entry page.

    Args:
        url: Address of the entry page. The person's name is taken from the
            URL-encoded path segment that directly follows ``item``.

    Returns:
        A one-element list containing a dict with the keys ``'⼈物'`` (name),
        ``'简介'`` (summary text with all whitespace removed) and ``'头像'``
        (portrait link, or ``''`` when it could not be read).

    Raises:
        ValueError: If the URL path has no ``item`` segment.

    Any exception raised by the driver while loading the page or locating
    the summary element is not caught here and propagates to the caller.

    NOTE(review): the ``find_element_by_*`` helpers are the Selenium 3 API;
    confirm the installed Selenium version still provides them.
    """
    driver.get(url)

    # Name: look only at the path so a query string or fragment cannot leak
    # into the name, then decode the percent-encoded segment after "item".
    segments = parse.urlsplit(url).path.split('/')
    name = parse.unquote(segments[segments.index('item') + 1])

    # Summary: split() + join() below strips every whitespace character.
    desc = driver.find_element_by_class_name('lemma-summary').text.split()

    # Portrait: best-effort, fall back to an empty string when the lookup
    # fails (for instance when the page has no summary picture).
    try:
        head_image = (
            driver.find_element_by_class_name('summary-pic')
            .find_element_by_tag_name('a')
            .get_attribute('href')
        )
    except Exception:
        head_image = ''

    return [{'⼈物': name, '简介': ''.join(desc), '头像': head_image}]
# Fetch one person's detail page together with their relationship graph.
def get_one_detail_kg(name):
    """Scrape one person's entry and everyone listed in its relation panel.

    Args:
        name: Entry title to look up; it is appended to the ``/item/`` URL.

    Returns:
        A ``(relationship_list, people_list)`` tuple.

        ``relationship_list`` has one dict per related person with the keys
        ``'⼈物A'`` (the related person), ``'关系'`` (the relationship label)
        and ``'⼈物B'`` (``name``).

        ``people_list`` is a flat list of profile dicts (keys ``'⼈物'``,
        ``'简介'``, ``'头像'``): first one per related person, in page order,
        then the profile of ``name`` itself.

    Exceptions raised by the driver (page load, missing summary element) and
    by ``get_one_detail`` are not caught here.
    """
    url = 'https://baike.baidu.com/item/' + name
    driver.get(url)

    # Read everything needed from this page before navigating away.
    # Summary: split() + join() below strips every whitespace character.
    desc = driver.find_element_by_class_name('lemma-summary').text.split()

    # Portrait: best-effort, fall back to an empty string when the lookup
    # fails (for instance when the page has no summary picture).
    try:
        head_image = (
            driver.find_element_by_class_name('summary-pic')
            .find_element_by_tag_name('a')
            .get_attribute('href')
        )
    except Exception:
        head_image = ''

    # Collect the related people's links first and visit them afterwards:
    # navigating inside the loop would invalidate the elements still being
    # iterated.
    relationship_list = []
    detail_urls = []
    for item in driver.find_elements_by_class_name('lemma-relation-item'):
        relationship = item.find_element_by_class_name('name').text
        related_name = item.find_element_by_class_name('title').text
        relationship_list.append(
            {'⼈物A': related_name, '关系': relationship, '⼈物B': name}
        )
        detail_urls.append(
            item.find_element_by_tag_name('a').get_attribute('href')
        )

    people_list = []
    for detail_url in detail_urls:
        # get_one_detail returns a one-element list; extend keeps the result
        # flat so it can be fed straight into a DataFrame.
        people_list.extend(get_one_detail(detail_url))
    people_list.append(
        {'⼈物': name, '简介': ''.join(desc), '头像': head_image}
    )

    return relationship_list, people_list
# Scrape every seed name, accumulate the relationships and profiles, and
# export each collection to its own Excel file.
relationship_list = []
people_list = []
names = ['⼩明', '⼩华']
for i in names:
    print(i)  # progress output: the name currently being scraped
    one_relationship_list, one_people_list = get_one_detail_kg(i)
    relationship_list += one_relationship_list
    # Add each person's profiles exactly once per seed name.
    people_list += one_people_list

# NOTE(review): the legacy .xls format needs an engine that recent pandas
# releases may no longer ship; confirm the target pandas version or switch
# the extension to .xlsx.
df = pd.DataFrame(relationship_list)
df.to_excel('⼈物关系.xls', index=False)
df = pd.DataFrame(people_list)
df.to_excel('⼈物.xls', index=False)