"""Crawl the name and official website address of universities in every Chinese province."""
from urllib.parse import urljoin

import requests
from lxml import etree


class School(object):
    """Scraper for the university directory published on ``self.url``.

    The home page links to one listing page per province; each listing page
    contains the universities of that province.
    """

    def __init__(self, timeout=10):
        """Set up the request headers, the site root and the request timeout.

        Args:
            timeout: Seconds to wait for each HTTP request before
                ``requests`` raises a timeout error. Without a timeout a
                stalled server would block the crawl indefinitely.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
        }
        self.url = 'http://www.cnxiaoyuan.com/'
        self.timeout = timeout

    def _fetch_html(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree."""
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        # The page body is decoded explicitly as UTF-8 before parsing.
        return etree.HTML(response.content.decode('utf-8'))

    def province_school_url(self):
        """Return the absolute URL of every province listing page.

        Returns:
            A list of URL strings, in the order the links appear on the
            home page.
        """
        html = self._fetch_html(self.url)
        # The last three <li> entries of the category list are skipped
        # (presumably they are not provinces -- confirm against the page).
        li_list = html.xpath("//div[@id='homecate']/ul/li")[0:-3]
        province = []
        for li in li_list:
            for href in li.xpath("./a/@href"):
                # urljoin avoids a doubled slash when the href is already
                # root-relative and leaves absolute hrefs untouched.
                province.append(urljoin(self.url, href))
        return province

    def school_url(self, province):
        """Print and collect the universities listed on each province page.

        Args:
            province: Iterable of province listing page URLs, as returned by
                ``province_school_url``.

        Returns:
            A flat list in which, for every university entry, the list of
            website strings is appended first and the list of title strings
            second (both are raw XPath ``text()`` results).
        """
        school_list = []
        for province_page in province:
            html = self._fetch_html(province_page)
            # Each <li> of the site list holds one university entry.
            for li in html.xpath("//ul[@class='sitelist']/li"):
                school_title = li.xpath("./div/h3/a/text()")
                school_url = li.xpath("./div/address/a/text()")
                school_list.append(school_url)
                school_list.append(school_title)
                print(school_title, school_url)
        return school_list


if __name__ == '__main__':
    s = School()
    province = s.province_school_url()
    # A single call already walks every province page; repeating it would
    # only re-download the same pages and print duplicate results.
    s.school_url(province)