import pymysql   # MySQL driver, used to store the scraped records
import requests
from bs4 import BeautifulSoup
def vall(morau, k):
    # Split the BibTeX text of a bibref <div> on '=', take the k-th field,
    # and return the value enclosed in the first pair of braces.
    page = morau.text.split('=')[k]
    page = page.split('}')
    page = page[0].split('{')
    return page[1]
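# For illustration only (hypothetical sample, not fetched by this script):
# a bibref <div> on the CVF open access site holds a BibTeX entry roughly like
#   @InProceedings{Doe_2021_WACV, author = {Doe, Jane}, title = {Some Title},
#     booktitle = {...}, month = {January}, year = {2021}, pages = {1-10}}
# so vall(div, -1) would return the pages value, vall(div, -5) the title,
# matching the negative indices used in the loop below.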
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'
}
conn = pymysql.connect(
    host='127.0.0.1',   # host, 127.0.0.1 by default
    user='root',        # user name
    passwd='12345',     # password
    port=3306,          # port, 3306 by default
    db='arcticle',      # database name
    charset='utf8'      # character encoding
)
cur = conn.cursor()  # create a cursor object
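# Assumed schema (not part of the original script): the insert at the bottom
# expects a table named `lunwen` whose columns match the dict keys built below,
# e.g. something along the lines of:
#   CREATE TABLE lunwen (
#       author TEXT, title TEXT, booktitle TEXT, month VARCHAR(20),
#       year VARCHAR(10), page VARCHAR(50), longtitle TEXT, link TEXT,
#       abstract TEXT, keywords TEXT
#   );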
url = "http://openaccess.thecvf.com/WACV2021"
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser') #
#print(soup)
soup.a.contents == 'pdf'
#print(soup.a.contents)
pdfs = soup.findAll(name="a", text="pdf")
morau=soup.findAll(name='div',attrs={'class':'bibref pre-white-space'})
#print(morau)
m = 0  # index of the bibref entry currently being scraped
ls1 = []
while m < len(morau):
    info = {}  # must be re-created inside the loop
    info['page'] = vall(morau=morau[m], k=-1)
    info['year'] = vall(morau=morau[m], k=-2)
    info['month'] = vall(morau=morau[m], k=-3)
    info['booktitle'] = vall(morau=morau[m], k=-4)
    info['title'] = vall(morau=morau[m], k=-5)
    info['author'] = vall(morau=morau[m], k=-6)
    ls1.append(info)
    m += 1
# ls1.reverse()  # reverses in place; ls1 = ls1.reverse() would return None
lis = []
jianjie = ""  # abstract text
n = 0
for item in ls1:
    print(item)
    print('-------------------------------------------------------------')
for i, pdf in enumerate(pdfs):
    pdf_name = pdf["href"].split('/')[-1]
    name = pdf_name.split('.')[0].replace("_WACV_2021_paper", "")
    link = "http://openaccess.thecvf.com/content/WACV2021/html/" + name + "_WACV_2021_paper.html"
    html1 = requests.get(link, headers=headers)
    soup1 = BeautifulSoup(html1.content, 'html.parser')
    weizhi = soup1.find('div', attrs={'id': 'abstract'})
    if weizhi:
        jianjie = weizhi.get_text()
    print("This is record " + str(i))
    # The file name is "FirstAuthor_Word1_Word2_...", so splitting on '_'
    # gives a rough comma-separated keyword list.
    keyword = str(name).split('_')
    keywords = ','.join(keyword)
    info = dict(ls1[n])  # copy the BibTeX fields scraped above
    n = n + 1
    info['longtitle'] = name
    info['link'] = link
    info['abstract'] = jianjie
    info['keywords'] = keywords
    print(info)
    lis.append(info)
print(lis)
cursor = conn.cursor()
for i in range(len(lis)):
    cols = ", ".join('`{}`'.format(k) for k in lis[i].keys())
    print(cols)      # e.g. '`author`, `title`, ...'
    val_cols = ', '.join('%({})s'.format(k) for k in lis[i].keys())
    print(val_cols)  # e.g. '%(author)s, %(title)s, ...'
    sql = "insert into lunwen(%s) values(%s)"
    res_sql = sql % (cols, val_cols)
    print(res_sql)
    cursor.execute(res_sql, lis[i])  # pass the record dict as named parameters
conn.commit()
num = 1
print(num)
print("Done")