# Python从数据库里面取字段爬图片1.0版本 (Python: crawl images from URL fields stored in a database, v1.0)
#!/usr/bin/python3  # optional: tells the OS which interpreter runs this script
#-*- coding: utf-8 -*-  # optional: declares the source-file character encoding
import os
import re
import urllib.error
import urllib.parse
import urllib.request

import pymysql
'''
URL has a file extension: stored under the domain folder, strictly mirroring the URL path.
URL has no extension: stored directly in the domain directory as a file name keeping only
underscores, letters and digits.
'''
class Downloadphotos:
    """Download every http(s) image URL found in selected database columns.

    Each file is stored under a per-domain directory in the current working
    directory; the directory name is the URL host with '.' and '-' replaced
    by '_', and the URL path is mirrored below it.
    """

    def __init__(self):
        pass

    def run(self):
        """Page through the configured table and download every URL found.

        Returns:
            False once all pages have been processed (kept for the original
            caller contract).
        """
        print("running...")
        offset = 0
        limit = 5
        data = ["default"]  # non-empty sentinel so the loop body runs at least once
        while len(data) > 0:
            data = self.files("localhost", "website", "website123", "beiiibota",
                              "bota_photo", "corver,gallery", offset, limit)
            self.load(data)
            offset += limit
            print(offset, "-", limit)
        print("ok")
        return False

    def files(self, host, user, pwd, db, table, cols, offset, limit):
        """Fetch one page of rows and extract every http(s) URL they contain.

        Args:
            host/user/pwd/db: MySQL connection settings.
            table: table to read (interpolated into the SQL — must be trusted).
            cols: comma-separated column list (interpolated — must be trusted).
            offset/limit: paging window, bound as query parameters.

        Returns:
            Flat list of URL strings found in the selected columns.
        """
        # Keyword arguments: the positional connect(host, user, pwd, db) form
        # was removed in PyMySQL 1.0.
        conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        try:
            with conn.cursor() as cursor:
                # Identifiers (table/cols) cannot be bound as parameters, but
                # OFFSET/LIMIT can — avoids formatting values into the SQL.
                cursor.execute(
                    "SELECT {cols} FROM `{table}` LIMIT %s, %s".format(
                        table=table, cols=cols),
                    (offset, limit))
                result = cursor.fetchall()
        finally:
            conn.close()  # always release the connection, even on query errors

        data = []
        # rg1: the whole cell is a single URL; rg2: extract URLs embedded in text.
        rg1 = re.compile(r"^http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$")
        rg2 = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        for item in result:
            for url in item:
                if not isinstance(url, str):
                    continue  # NULL (or binary) column values cannot be matched
                if rg1.match(url):
                    data.append(url)
                else:
                    data.extend(rg2.findall(url))
        return data

    def load(self, data):
        """Download each URL in *data*, skipping files that already exist.

        A failed download is logged and skipped; it never aborts the batch.
        Returns True when the batch has been processed.
        """
        count = 0
        for url in data:
            count += 1
            if self.found(url):
                print(str(count) + ".已跳过:" + urllib.parse.urlparse(url).path)
                continue
            print(str(count) + ".正在下载:" + urllib.parse.urlparse(url).path)
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.HTTPError as e:
                # e.code is an int: str() it before concatenating.
                print("The HTTPError:" + str(e.code))
                continue  # no response to read — move on to the next URL
            except urllib.error.URLError as e:
                print("The URLError:" + str(e.reason))
                continue
            img = response.read()
            with open(self.fullpath(url), "wb") as f:
                f.write(img)
        return True

    def found(self, url):
        """Return True if *url*'s target file already exists locally.

        Side effect: creates the domain directory and any intermediate path
        directories so a subsequent download can write straight into them.
        """
        urls = urllib.parse.urlparse(url)
        base = os.getcwd() + os.sep + urls.netloc.replace(".", "_").replace("-", "_")
        path = os.path.dirname(urls.path.replace("/", os.sep))
        file = os.path.basename(urls.path)
        # exist_ok collapses the original mkdir/makedirs pair and is race-free.
        os.makedirs(base + path, exist_ok=True)
        return os.path.exists(base + path + os.sep + file)

    def fullpath(self, url):
        """Return the full local file path where *url* is (to be) stored."""
        urls = urllib.parse.urlparse(url)
        base = os.getcwd() + os.sep + urls.netloc.replace(".", "_").replace("-", "_")
        return base + urls.path.replace("/", os.sep)
if __name__ == "__main__":
    # Script entry point: build the downloader and start the crawl.
    print("this is downloadphotos...")
    downloader = Downloadphotos()
    downloader.run()