Collecting CNKI Journal Data with Scrapy in Python (Part 4)

In Collecting CNKI Journal Data with Scrapy in Python (Part 3) the required data was collected; the next step is to store and export that data.

Data Storage

  1. Create the MySQL tables: one table for the journal papers (`periodical_data`) and one for the citing literature (`reference`); a hedged schema sketch follows below.
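
    The column names below are taken from the pipeline and export code further down, but the column types and lengths are assumptions, so adjust them to the real schema. A minimal sketch using pymysql:

    import pymysql

    # Placeholder connection values; use the MYSQL_* settings from step 3.
    connect = pymysql.connect(host='localhost', db='数据库名', user='登录名',
                              passwd='密码', charset='utf8')
    cursor = connect.cursor()

    # Papers collected from the journal pages.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS periodical_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            author VARCHAR(255),
            journalName VARCHAR(255),
            publicationDate VARCHAR(64),
            referenceNum VARCHAR(16),
            downloadNum VARCHAR(16)
        ) DEFAULT CHARSET=utf8""")

    # Citing literature; quote_title refers back to the cited paper's title.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `reference` (
            id INT AUTO_INCREMENT PRIMARY KEY,
            literature_title VARCHAR(255),
            literature_author VARCHAR(255),
            literature_journalName VARCHAR(255),
            literature_time VARCHAR(64),
            quote_title VARCHAR(255)
        ) DEFAULT CHARSET=utf8""")

    connect.commit()
    connect.close()
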
  2. Import the pymysql library: import pymysql
  3. Configure MySQL by adding the following settings to settings.py:
    # start MySQL database configure setting
    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = '数据库名'
    MYSQL_USER = '登录名'
    MYSQL_PASSWD = '密码'
    # end of MySQL database configure setting
  4. Write the item pipeline

    Each item pipeline component (sometimes simply called an "Item Pipeline") is a Python class that implements a simple method. It receives an item, performs an action on it, and also decides whether the item continues through the pipeline or is dropped and no longer processed.

    Take the item pipeline that handles the collected citing literature as an example:

    import pymysql

    from cnki import settings        # project settings module; adjust the path to the actual project
    from cnki.items import Quote     # item class for citing literature; adjust the path to the actual project


    class CnkiPipeline(object):
        def __init__(self):
            self.connect = pymysql.connect(
                host=settings.MYSQL_HOST,
                db=settings.MYSQL_DBNAME,
                user=settings.MYSQL_USER,
                passwd=settings.MYSQL_PASSWD,
                charset='utf8',
                use_unicode=True)
            self.cursor = self.connect.cursor()

        def process_item(self, item, spider):
            # Only handle citing-literature items; everything else passes through untouched.
            if item.__class__ == Quote:
                try:
                    # Check whether this citing record is already stored.
                    self.cursor.execute(
                        """select * from reference where literature_title = %s and quote_title = %s
                           and literature_author = %s and literature_journalName = %s and literature_time = %s""",
                        (item['paper_title'], str(item['quote_title']), item['paper_author'],
                         item['paper_journalName'], item['paper_time']))
                    ret = self.cursor.fetchone()
                    if ret:
                        # The record exists: update it.
                        self.cursor.execute(
                            """update reference set literature_title = %s, literature_author = %s,
                               literature_journalName = %s, literature_time = %s, quote_title = %s
                               where literature_title = %s and quote_title = %s and literature_author = %s""",
                            (str(item['paper_title']),
                             str(item['paper_author']),
                             str(item['paper_journalName']),
                             str(item['paper_time']),
                             str(item['quote_title']),
                             str(item['paper_title']),
                             str(item['quote_title']),
                             str(item['paper_author'])))
                    else:
                        # The record does not exist yet: insert it.
                        self.cursor.execute(
                            """insert into reference(literature_title, literature_author, literature_journalName,
                               literature_time, quote_title)
                               values (%s, %s, %s, %s, %s)""",
                            (str(item['paper_title']),
                             str(item['paper_author']),
                             str(item['paper_journalName']),
                             str(item['paper_time']),
                             str(item['quote_title'])))
                    self.connect.commit()
                except Exception as error:
                    spider.logger.error(error)
            return item
    The item pipeline first checks whether the item is of the type it is supposed to handle, then queries the database to see whether the record already exists: if it does, the record is updated; otherwise it is inserted. The pipeline also has to be enabled in settings.py, as shown below.
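
    Enabling the pipeline is standard Scrapy configuration via the ITEM_PIPELINES setting. The dotted path below assumes the project package is named cnki, matching the imports used above; adjust it to the real project name:

    # settings.py: enable the pipeline (the module path is an assumption based on the project name)
    ITEM_PIPELINES = {
        'cnki.pipelines.CnkiPipeline': 300,
    }
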
  5. Crawl results

Data Export

To export the data to Excel, the xlrd, xlsxwriter and xlutils libraries are used; the export code is as follows:

from collections import Counter

import pymysql
import pymysql.cursors
import xlrd
from xlutils import copy

# title, publicationDate and year are assumed to be set by the surrounding export script
# (the journal name, the issue date and the publication year of the issue being exported).

# A DictCursor is needed because rows are accessed by column name below;
# the connection values should match the MYSQL_* settings used for crawling.
connect = pymysql.connect(host='localhost', db='数据库名', user='登录名', passwd='密码',
                          charset='utf8', cursorclass=pymysql.cursors.DictCursor)

try:
    with connect.cursor() as cursor:
        # Papers of the given journal and issue, ordered by citation count, plus their citing literature.
        sql_quote = ("select * from periodical_data where journalName = %s "
                     "and publicationDate = %s ORDER BY referenceNum DESC")
        sql_reference = "select * from reference where quote_title = %s"
        cursor.execute(sql_quote, (title, publicationDate))
        results = cursor.fetchall()
        # oneup counts papers cited more than once a year on average, onelow counts papers
        # cited less than once a year, and noone counts papers that were never cited.
        oneup = 0
        onelow = 0
        noone = 0
        endrows = 0
        for result in results:
            reference = ""
            years = []
            # Re-open the workbook every round so nrows reflects the rows saved so far.
            resultwork = xlrd.open_workbook(year + title + ".xls")
            sheet1 = resultwork.sheet_by_index(0)
            old_excel = copy.copy(resultwork)
            sheet_write = old_excel.get_sheet(0)
            rows = sheet1.nrows
            cols = sheet1.ncols
            quote_title = result['title']
            if result['referenceNum'] != "0":
                cursor.execute(sql_reference, (quote_title,))
                reference = cursor.fetchall()
            if result['referenceNum']:
                # More citations than years since publication means more than one citation per year.
                if int(result['referenceNum']) > (2016 - int(year)):
                    if oneup == 0:
                        sheet_write.write(rows, 0, "引用每年一次以上")  # header: cited more than once a year
                        rows = rows + 1
                    oneup = oneup + 1
                    sheet_write.write(rows, 0, "1")
                    sheet_write.write(rows, 1, result['title'])
                    sheet_write.write(rows, 2, result['author'])
                    sheet_write.write(rows, 3, result['journalName'])
                    sheet_write.write(rows, 4, result['publicationDate'])
                    sheet_write.write(rows, 5, result['referenceNum'])
                    sheet_write.write(rows, 6, result['downloadNum'])
                else:
                    if onelow == 0:
                        sheet_write.write(rows, 0, "引用每年一次以下")  # header: cited less than once a year
                        rows = rows + 1
                    onelow = onelow + 1
                    sheet_write.write(rows, 0, "1")
                    sheet_write.write(rows, 1, result['title'])
                    sheet_write.write(rows, 2, result['author'])
                    sheet_write.write(rows, 3, result['journalName'])
                    sheet_write.write(rows, 4, result['publicationDate'])
                    sheet_write.write(rows, 5, result['referenceNum'])
                    sheet_write.write(rows, 6, result['downloadNum'])
            else:
                if noone == 0:
                    sheet_write.write(rows, 0, "没有引用")  # header: never cited
                    rows = rows + 1
                noone = noone + 1
                sheet_write.write(rows, 0, "1")
                sheet_write.write(rows, 1, result['title'])
                sheet_write.write(rows, 2, result['author'])
                sheet_write.write(rows, 3, result['journalName'])
                sheet_write.write(rows, 4, result['publicationDate'])
                sheet_write.write(rows, 5, result['referenceNum'])
                sheet_write.write(rows, 6, result['downloadNum'])

            # Every citing record is appended below the paper's own row and marked with "0".
            for row in range(len(reference)):
                sheet_write.write(rows + row + 1, 0, "0")
                sheet_write.write(rows + row + 1, 1, reference[row]['literature_title'])
                sheet_write.write(rows + row + 1, 2, reference[row]['literature_author'])
                sheet_write.write(rows + row + 1, 3, reference[row]['literature_journalName'])
                sheet_write.write(rows + row + 1, 4, reference[row]['literature_time'])
                years.append(reference[row]['literature_time'])
            c = Counter(years)
            print(c.most_common(1))
            print(result['title'])
            if result['referenceNum']:
                if int(result['referenceNum']) > (2016 - int(year)):
                    if c.most_common(1):
                        # most_common(1) yields [(year, count)]: the year in which the paper was cited most.
                        for most in c.most_common(1):
                            most_year, most_time = most
                            sheet_write.write(rows, 7, most_year)
                            sheet_write.write(rows, 8, str(most_time))
            old_excel.save(year + title + ".xls")
            endrows = rows + len(reference) + 2

        # Append the summary percentages after the last written row.
        resultwork = xlrd.open_workbook(year + title + ".xls")
        sheet1 = resultwork.sheet_by_index(0)
        old_excel = copy.copy(resultwork)
        sheet_write = old_excel.get_sheet(0)
        # Share of papers cited more than once a year, less than once a year, and never cited.
        sheet_write.write(endrows, 0, "每年引用次数一次以上百分比:" + str(float(oneup) / len(results)))
        sheet_write.write(endrows + 1, 0, "每年引用次数一次以下百分比:" + str(float(onelow) / len(results)))
        sheet_write.write(endrows + 2, 0, "每年没有引用百分比:" + str(float(noone) / len(results)))
        old_excel.save(year + title + ".xls")
finally:
    connect.close()
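
The export above relies on the xlrd / xlutils "open, copy, write, save" cycle to append rows to an existing .xls file. A minimal, self-contained sketch of that cycle (the file name and cell values are placeholders):

import xlrd
import xlwt
from xlutils import copy

# Create a small .xls file to append to (placeholder content).
book = xlwt.Workbook()
sheet = book.add_sheet('Sheet1')
sheet.write(0, 0, 'title')
book.save('demo.xls')

# Open the saved file with xlrd, make a writable copy with xlutils,
# append one row after the existing ones, and save it back.
readable = xlrd.open_workbook('demo.xls')
rows = readable.sheet_by_index(0).nrows  # rows already present
writable = copy.copy(readable)
writable.get_sheet(0).write(rows, 0, 'appended row')
writable.save('demo.xls')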

Export result:


References

Scrapy documentation