Collecting CNKI Journal Data with Scrapy in Python (Part 4)

In Collecting CNKI Journal Data with Scrapy in Python (Part 3) the required data was collected; the next step is to store and export that data.

Data Storage

  1. Create the MySQL tables: one table for the journal papers (`periodical_data`) and one for the citing literature (`reference`); a hedged schema sketch follows below.
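
    The column names below are taken from the pipeline and export code further down, but the column types and lengths are assumptions, so adjust them to the real schema. A minimal sketch using pymysql:

    import pymysql

    # Placeholder connection values; use the MYSQL_* settings from step 3.
    connect = pymysql.connect(host='localhost', db='数据库名', user='登录名',
                              passwd='密码', charset='utf8')
    cursor = connect.cursor()

    # Papers collected from the journal pages.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS periodical_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            author VARCHAR(255),
            journalName VARCHAR(255),
            publicationDate VARCHAR(64),
            referenceNum VARCHAR(16),
            downloadNum VARCHAR(16)
        ) DEFAULT CHARSET=utf8""")

    # Citing literature; quote_title refers back to the cited paper's title.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `reference` (
            id INT AUTO_INCREMENT PRIMARY KEY,
            literature_title VARCHAR(255),
            literature_author VARCHAR(255),
            literature_journalName VARCHAR(255),
            literature_time VARCHAR(64),
            quote_title VARCHAR(255)
        ) DEFAULT CHARSET=utf8""")

    connect.commit()
    connect.close()
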
  2. Import the pymysql library: import pymysql
  3. Configure MySQL by adding the following settings to settings.py:
    # start MySQL database configure setting
    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = '数据库名'
    MYSQL_USER = '登录名'
    MYSQL_PASSWD = '密码'
    # end of MySQL database configure setting
  4. Write the item pipeline

    Each item pipeline component (sometimes simply called an "Item Pipeline") is a Python class that implements a simple method. It receives an item, performs an action on it, and also decides whether the item continues through the pipeline or is dropped and no longer processed.

    Take the item pipeline that handles the collected citing literature as an example:

    import pymysql

    from cnki import settings        # project settings module; adjust the path to the actual project
    from cnki.items import Quote     # item class for citing literature; adjust the path to the actual project


    class CnkiPipeline(object):
        def __init__(self):
            self.connect = pymysql.connect(
                host=settings.MYSQL_HOST,
                db=settings.MYSQL_DBNAME,
                user=settings.MYSQL_USER,
                passwd=settings.MYSQL_PASSWD,
                charset='utf8',
                use_unicode=True)
            self.cursor = self.connect.cursor()

        def process_item(self, item, spider):
            # Only handle citing-literature items; everything else passes through untouched.
            if item.__class__ == Quote:
                try:
                    # Check whether this citing record is already stored.
                    self.cursor.execute(
                        """select * from reference where literature_title = %s and quote_title = %s
                           and literature_author = %s and literature_journalName = %s and literature_time = %s""",
                        (item['paper_title'], str(item['quote_title']), item['paper_author'],
                         item['paper_journalName'], item['paper_time']))
                    ret = self.cursor.fetchone()
                    if ret:
                        # The record exists: update it.
                        self.cursor.execute(
                            """update reference set literature_title = %s, literature_author = %s,
                               literature_journalName = %s, literature_time = %s, quote_title = %s
                               where literature_title = %s and quote_title = %s and literature_author = %s""",
                            (str(item['paper_title']),
                             str(item['paper_author']),
                             str(item['paper_journalName']),
                             str(item['paper_time']),
                             str(item['quote_title']),
                             str(item['paper_title']),
                             str(item['quote_title']),
                             str(item['paper_author'])))
                    else:
                        # The record does not exist yet: insert it.
                        self.cursor.execute(
                            """insert into reference(literature_title, literature_author, literature_journalName,
                               literature_time, quote_title)
                               values (%s, %s, %s, %s, %s)""",
                            (str(item['paper_title']),
                             str(item['paper_author']),
                             str(item['paper_journalName']),
                             str(item['paper_time']),
                             str(item['quote_title'])))
                    self.connect.commit()
                except Exception as error:
                    spider.logger.error(error)
            return item
    The item pipeline first checks whether the item is of the type it is supposed to handle, then queries the database to see whether the record already exists: if it does, the record is updated; otherwise it is inserted. The pipeline also has to be enabled in settings.py, as shown below.
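
    Enabling the pipeline is standard Scrapy configuration via the ITEM_PIPELINES setting. The dotted path below assumes the project package is named cnki, matching the imports used above; adjust it to the real project name:

    # settings.py: enable the pipeline (the module path is an assumption based on the project name)
    ITEM_PIPELINES = {
        'cnki.pipelines.CnkiPipeline': 300,
    }
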
  5. Crawl results

Data Export

To export the data to Excel, the xlrd, xlsxwriter and xlutils libraries are used; the export code is as follows:

from collections import Counter

import pymysql
import pymysql.cursors
import xlrd
from xlutils import copy

# title, publicationDate and year are assumed to be set by the surrounding export script
# (the journal name, the issue date and the publication year of the issue being exported).

# A DictCursor is needed because rows are accessed by column name below;
# the connection values should match the MYSQL_* settings used for crawling.
connect = pymysql.connect(host='localhost', db='数据库名', user='登录名', passwd='密码',
                          charset='utf8', cursorclass=pymysql.cursors.DictCursor)

try:
    with connect.cursor() as cursor:
        # Papers of the given journal and issue, ordered by citation count, plus their citing literature.
        sql_quote = ("select * from periodical_data where journalName = %s "
                     "and publicationDate = %s ORDER BY referenceNum DESC")
        sql_reference = "select * from reference where quote_title = %s"
        cursor.execute(sql_quote, (title, publicationDate))
        results = cursor.fetchall()
        # oneup counts papers cited more than once a year on average, onelow counts papers
        # cited less than once a year, and noone counts papers that were never cited.
        oneup = 0
        onelow = 0
        noone = 0
        endrows = 0
        for result in results:
            reference = ""
            years = []
            # Re-open the workbook every round so nrows reflects the rows saved so far.
            resultwork = xlrd.open_workbook(year + title + ".xls")
            sheet1 = resultwork.sheet_by_index(0)
            old_excel = copy.copy(resultwork)
            sheet_write = old_excel.get_sheet(0)
            rows = sheet1.nrows
            cols = sheet1.ncols
            quote_title = result['title']
            if result['referenceNum'] != "0":
                cursor.execute(sql_reference, (quote_title,))
                reference = cursor.fetchall()
            if result['referenceNum']:
                # More citations than years since publication means more than one citation per year.
                if int(result['referenceNum']) > (2016 - int(year)):
                    if oneup == 0:
                        sheet_write.write(rows, 0, "引用每年一次以上")  # header: cited more than once a year
                        rows = rows + 1
                    oneup = oneup + 1
                    sheet_write.write(rows, 0, "1")
                    sheet_write.write(rows, 1, result['title'])
                    sheet_write.write(rows, 2, result['author'])
                    sheet_write.write(rows, 3, result['journalName'])
                    sheet_write.write(rows, 4, result['publicationDate'])
                    sheet_write.write(rows, 5, result['referenceNum'])
                    sheet_write.write(rows, 6, result['downloadNum'])
                else:
                    if onelow == 0:
                        sheet_write.write(rows, 0, "引用每年一次以下")  # header: cited less than once a year
                        rows = rows + 1
                    onelow = onelow + 1
                    sheet_write.write(rows, 0, "1")
                    sheet_write.write(rows, 1, result['title'])
                    sheet_write.write(rows, 2, result['author'])
                    sheet_write.write(rows, 3, result['journalName'])
                    sheet_write.write(rows, 4, result['publicationDate'])
                    sheet_write.write(rows, 5, result['referenceNum'])
                    sheet_write.write(rows, 6, result['downloadNum'])
            else:
                if noone == 0:
                    sheet_write.write(rows, 0, "没有引用")  # header: never cited
                    rows = rows + 1
                noone = noone + 1
                sheet_write.write(rows, 0, "1")
                sheet_write.write(rows, 1, result['title'])
                sheet_write.write(rows, 2, result['author'])
                sheet_write.write(rows, 3, result['journalName'])
                sheet_write.write(rows, 4, result['publicationDate'])
                sheet_write.write(rows, 5, result['referenceNum'])
                sheet_write.write(rows, 6, result['downloadNum'])

            # Every citing record is appended below the paper's own row and marked with "0".
            for row in range(len(reference)):
                sheet_write.write(rows + row + 1, 0, "0")
                sheet_write.write(rows + row + 1, 1, reference[row]['literature_title'])
                sheet_write.write(rows + row + 1, 2, reference[row]['literature_author'])
                sheet_write.write(rows + row + 1, 3, reference[row]['literature_journalName'])
                sheet_write.write(rows + row + 1, 4, reference[row]['literature_time'])
                years.append(reference[row]['literature_time'])
            c = Counter(years)
            print(c.most_common(1))
            print(result['title'])
            if result['referenceNum']:
                if int(result['referenceNum']) > (2016 - int(year)):
                    if c.most_common(1):
                        # most_common(1) yields [(year, count)]: the year in which the paper was cited most.
                        for most in c.most_common(1):
                            most_year, most_time = most
                            sheet_write.write(rows, 7, most_year)
                            sheet_write.write(rows, 8, str(most_time))
            old_excel.save(year + title + ".xls")
            endrows = rows + len(reference) + 2

        # Append the summary percentages after the last written row.
        resultwork = xlrd.open_workbook(year + title + ".xls")
        sheet1 = resultwork.sheet_by_index(0)
        old_excel = copy.copy(resultwork)
        sheet_write = old_excel.get_sheet(0)
        # Share of papers cited more than once a year, less than once a year, and never cited.
        sheet_write.write(endrows, 0, "每年引用次数一次以上百分比:" + str(float(oneup) / len(results)))
        sheet_write.write(endrows + 1, 0, "每年引用次数一次以下百分比:" + str(float(onelow) / len(results)))
        sheet_write.write(endrows + 2, 0, "每年没有引用百分比:" + str(float(noone) / len(results)))
        old_excel.save(year + title + ".xls")
finally:
    connect.close()
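
The export above relies on the xlrd / xlutils "open, copy, write, save" cycle to append rows to an existing .xls file. A minimal, self-contained sketch of that cycle (the file name and cell values are placeholders):

import xlrd
import xlwt
from xlutils import copy

# Create a small .xls file to append to (placeholder content).
book = xlwt.Workbook()
sheet = book.add_sheet('Sheet1')
sheet.write(0, 0, 'title')
book.save('demo.xls')

# Open the saved file with xlrd, make a writable copy with xlutils,
# append one row after the existing ones, and save it back.
readable = xlrd.open_workbook('demo.xls')
rows = readable.sheet_by_index(0).nrows  # rows already present
writable = copy.copy(readable)
writable.get_sheet(0).write(rows, 0, 'appended row')
writable.save('demo.xls')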

Export result:


References

Scrapy documentation