Scraping CVPR 2021 papers with Python
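The script below pulls paper metadata (title, authors, pages, abstract, and so on) from the CVF open access site (the URL in the code points at the WACV 2021 listing on openaccess.thecvf.com), parses each paper's BibTeX block and abstract page with BeautifulSoup, and writes one row per paper into a local MySQL table via pymysql.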


import pymysql  # MySQL driver, used below to store the scraped records
import requests
from bs4 import BeautifulSoup

def vall(morau, k):
    # Return the brace-delimited value of the k-th '='-separated field of a
    # BibTeX entry, e.g. the field 'year = {2021}' yields '2021'.
    page = morau.text.split('=')[k]
    page = page.split('}')
    page = page[0].split('{')
    return page[1]
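# The 'bibref' divs on openaccess.thecvf.com contain BibTeX entries that look
# roughly like this (field order assumed from the site's markup):
#
#   @InProceedings{Someone_2021_WACV,
#       author    = {Last, First},
#       title     = {Some Paper Title},
#       booktitle = {Proceedings of the IEEE/CVF Winter Conference on
#                    Applications of Computer Vision (WACV)},
#       month     = {January},
#       year      = {2021},
#       pages     = {1-10}
#   }
#
# vall() counts '='-separated fields from the end: k=-1 is pages, k=-2 is
# year, ..., k=-6 is author. If the site reorders or adds fields, these
# indices break.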
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'
}
conn = pymysql.connect(
    host='127.0.0.1',   # host, default 127.0.0.1
    user='root',        # user name
    passwd='12345',     # password
    port=3306,          # port, default 3306
    db='arcticle',      # database name (kept as originally created)
    charset='utf8'      # character encoding
)
cur = conn.cursor()  # cursor object
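# A minimal sketch of a table the INSERT below would fit. The column names
# match the info dicts built further down; the types and lengths here are
# assumptions, not necessarily the author's actual schema:
#
#   CREATE TABLE IF NOT EXISTS lunwen (
#       page VARCHAR(32), year VARCHAR(8), month VARCHAR(16),
#       booktitle VARCHAR(255), title VARCHAR(255), author VARCHAR(255),
#       longtitle VARCHAR(255), link VARCHAR(255),
#       abstract TEXT, keywords VARCHAR(255)
#   );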
url = "http://openaccess.thecvf.com/WACV2021"
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser') #
#print(soup)
soup.a.contents == 'pdf'
#print(soup.a.contents)
pdfs = soup.findAll(name="a", text="pdf")
morau=soup.findAll(name='div',attrs={'class':'bibref pre-white-space'})
#print(morau)
m=0#爬取第几条信息
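# pdfs and morau are parallel on this page: each paper contributes one 'pdf'
# anchor and one bibref div, in the same order. The second loop below relies
# on that when it walks ls1 with a plain index n.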
ls1 = []  # one metadata dict per paper, in page order
for entry in morau:
    info = {}  # fresh dict each iteration
    info['page'] = vall(morau=entry, k=-1)
    info['year'] = vall(morau=entry, k=-2)
    info['month'] = vall(morau=entry, k=-3)
    info['booktitle'] = vall(morau=entry, k=-4)
    info['title'] = vall(morau=entry, k=-5)
    info['author'] = vall(morau=entry, k=-6)
    ls1.append(info)
# ls1.reverse()  # would reverse in place; ls1 = ls1.reverse() returns None
lis = []  # final records: metadata + link + abstract + keywords
n = 0     # index into ls1, advanced once per paper
for item in ls1:
    print(item)
print('-------------------------------------------------------------')
for i, pdf in enumerate(pdfs):
    pdf_name = pdf["href"].split('/')[-1]
    name = pdf_name.split('.')[0].replace("_WACV_2021_paper", "")
    # Abstract page, e.g. .../content/WACV2021/html/<name>_WACV_2021_paper.html
    link = "http://openaccess.thecvf.com/content/WACV2021/html/" + name + "_WACV_2021_paper.html"
    html1 = requests.get(link, headers=headers)
    soup1 = BeautifulSoup(html1.content, 'html.parser')
    weizhi = soup1.find('div', attrs={'id': 'abstract'})
    jianjie = weizhi.get_text() if weizhi else ""  # avoid reusing the previous abstract
    print("Record #" + str(i))
    # Crude keywords: the underscore-separated parts of the file name.
    keywords = ','.join(name.split('_'))
    info = dict(ls1[n])  # copy the metadata dict rather than mutating it
    n = n + 1
    info['longtitle'] = name
    info['link'] = link
    info['abstract'] = jianjie
    info['keywords'] = keywords
    print(info)
    lis.append(info)
print(lis)
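# The INSERT below is assembled dynamically: the column list comes from the
# dict keys ('`page`, `year`, ...') and the VALUES clause uses pyformat
# placeholders ('%(page)s, %(year)s, ...'), so cursor.execute() can bind the
# dict itself as the parameter set. Column names are not escaped, which is
# acceptable here because the keys are hard-coded above.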
cursor = conn.cursor()
for record in lis:
    cols = ", ".join('`{}`'.format(k) for k in record.keys())
    val_cols = ', '.join('%({})s'.format(k) for k in record.keys())
    res_sql = "insert into lunwen(%s) values(%s)" % (cols, val_cols)
    print(res_sql)
    cursor.execute(res_sql, record)  # values are bound from the dict
    conn.commit()
    print("success")
cursor.close()
conn.close()
