Python从数据库里面取字段爬图片1.0版本

#!/usr/bin/python3   # optional: path to the Python interpreter for this script
# -*- coding: utf-8 -*-   # optional: declares the source file's character encoding
import os
import re
import urllib.error
import urllib.parse
import urllib.request

import pymysql
'''
有拓展名:严格按照目录存储在domain文件夹下
无拓展名:直接存储在domain目录下的保留下划线英文数字的文件
'''
class Downloadphotos:
    """Download image files whose URLs are stored in database columns.

    Each URL is saved under a per-domain directory (dots/dashes in the
    host name replaced by underscores) mirroring the URL's path, relative
    to the current working directory.
    """

    def __init__(self):
        # No instance state; connection parameters are passed per call.
        # BUGFIX: `pass` must be indented inside __init__ (was a SyntaxError).
        pass

    def run(self):
        """Page through the database and download every URL found.

        Returns:
            False once every page has been processed.
        """
        print("running...")
        offset = 0
        limit = 5
        data = ["default"]  # sentinel so the loop body executes at least once
        while len(data) > 0:
            data = self.files("localhost", "website", "website123",
                              "beiiibota", "bota_photo", "corver,gallery",
                              offset, limit)
            # BUGFIX: downloading and advancing the page must happen inside
            # the loop; previously they were outside, so the same offset was
            # fetched forever.
            self.load(data)
            offset += limit
            print(offset, "-", limit)

        print("ok")
        return False

    def files(self, host, user, pwd, db, table, cols, offset, limit):
        """Fetch one page of rows and extract every URL from the cells.

        A cell that is entirely a URL is kept as-is; otherwise every URL
        embedded inside the cell's text is extracted.

        Returns:
            list[str]: the URLs found in this page.
        """
        # BUGFIX: modern PyMySQL requires keyword arguments for connect().
        conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        try:
            cursor = conn.cursor()
            # NOTE: identifiers (table/columns) cannot be bound as SQL
            # parameters; they come from trusted code here. The LIMIT
            # values are coerced to int to keep the statement safe.
            cursor.execute(
                "SELECT {cols} FROM `{table}` LIMIT {offset}, {limit}".format(
                    table=table, cols=cols,
                    offset=int(offset), limit=int(limit)))
            result = cursor.fetchall()
        finally:
            # BUGFIX: close the connection even if the query raises.
            conn.close()

        data = []
        # Whole-cell match vs. embedded-URL search.
        rg1 = re.compile(r"^http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$")
        rg2 = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        for row in result:
            for cell in row:
                if cell is None:
                    continue  # skip NULL columns
                if rg1.match(cell):
                    data.append(cell)
                else:
                    # BUGFIX: extraction belongs to the else branch only;
                    # previously `ea` could be unbound (NameError) or stale
                    # results were appended again for matching cells.
                    data.extend(rg2.findall(cell))
        return data

    def load(self, data):
        """Download every URL in *data*, skipping files already on disk.

        Returns:
            True when the list has been processed.
        """
        count = 0
        for url in data:
            count += 1
            if self.found(url):
                print(str(count) + ".已跳过:" + urllib.parse.urlparse(url).path)
                # BUGFIX: skip files that already exist (the original
                # commented out this `continue` and re-downloaded them).
                continue
            print(str(count) + ".正在下载:" + urllib.parse.urlparse(url).path)
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.HTTPError as e:
                # BUGFIX: the exception names must be qualified
                # (urllib.error.*) and e.code is an int — stringify it.
                print("The HTTPError:" + str(e.code))
                continue  # BUGFIX: `response` is unbound after a failure
            except urllib.error.URLError as e:
                print("The URLError:" + str(e.reason))
                continue
            img = response.read()
            with open(self.fullpath(url), "wb") as f:
                f.write(img)
        return True

    def found(self, url):
        """Ensure the target directory exists; report whether the file does.

        Returns:
            True if the file for *url* already exists locally, else False.
        """
        parts = urllib.parse.urlparse(url)
        base = os.getcwd() + os.sep + parts.netloc.replace(".", "_").replace("-", "_")
        path = os.path.dirname(parts.path.replace("/", os.sep))
        file = os.path.basename(parts.path)
        # exist_ok avoids the check-then-create race of the original code
        # and also covers creating the per-domain base directory.
        os.makedirs(base + path, exist_ok=True)
        return os.path.exists(base + path + os.sep + file)

    def fullpath(self, url):
        """Return the local file path where *url* should be stored."""
        parts = urllib.parse.urlparse(url)
        base = os.getcwd() + os.sep + parts.netloc.replace(".", "_").replace("-", "_")
        return base + parts.path.replace("/", os.sep)


if __name__ == "__main__":
    print("this is downloadphotos...")
    dp = Downloadphotos()
    dp.run()

原文链接:https://blog.yongit.com/note/128760.html