За един проект ми трябваха различните „лоши“ адреси от www.shallalist.de под формата на MySQL таблици. Това е набързо написан скрипт, който парсва списъците и генерира SQL, готов за импортиране:
import sys
import os
# Root directory of the unpacked blacklist: gen_items() looks for one
# sub-directory per category here and categories() reads "global_usage" from
# it. The trailing separator lets the value be used as a plain string prefix.
# NOTE(review): machine-specific path; the name also shadows the builtin dir().
dir = "C:\\Users\\Yuks\\Desktop\\BL\\"
# Name of the table created by gen_cats_sql_tables().
categories_table = "bl_categories"
# Name of the table created by gen_items_sql_tables() and filled by gen_items().
elements_table = "bl_elements"
def ls(dir, hidden=False, relative=True):
    """Return the sorted entries of directory *dir*.

    Entries whose name starts with '.' are left out unless *hidden* is true.
    With *relative* true (the default) the bare entry names are returned;
    otherwise each name is joined onto *dir*.
    """
    entries = [
        name if relative else os.path.join(dir, name)
        for name in os.listdir(dir)
        if hidden or not name.startswith('.')
    ]
    return sorted(entries)
def gen_items():
    """Print one INSERT statement per domain of every blacklist category.

    Every entry of the blacklist root ``dir`` (as listed by ``ls``) is taken
    as a category name and its ``domains`` file is read line by line. The
    statements target the table named by ``elements_table`` and are written
    to standard output. Entries whose ``domains`` file cannot be opened are
    skipped silently; blank lines produce no statement.
    """
    template = "INSERT INTO `{el_cat}` (`cat_name`, `url`) VALUES ('{cat_name}', '{domain}');"

    def quote(value):
        # Make the value safe inside a single-quoted MySQL string literal:
        # backslashes first (so the doubling is not itself re-escaped),
        # then single quotes.
        return value.replace("\\", "\\\\").replace("'", "''")

    for el in ls(dir):
        path = os.path.join(dir, el, "domains")
        try:
            fileobject = open(path)
        except (IOError, OSError):
            # No readable "domains" file under this entry: skip it, keeping
            # the best-effort behaviour without hiding unrelated errors.
            continue
        with fileobject:
            for line in fileobject:
                domain = line.strip()
                if not domain:
                    # Lines read from a file still carry their newline, so
                    # emptiness has to be checked after stripping.
                    continue
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(template.format(el_cat=elements_table,
                                      cat_name=quote(el.strip()),
                                      domain=quote(domain)))
def categories():
    """Parse the ``global_usage`` file and print each category found.

    The file is read from the blacklist root ``dir``. Lines containing ``#``
    are ignored. A ``NAME:`` line supplies a category name and a ``DESC EN``
    line its description; once both are known the pair is recorded and the
    search starts over. Each recorded pair is printed as a
    ``(name, desc)`` tuple, followed by one blank line.

    Returns:
        list: ``{'name': ..., 'desc': ...}`` dicts in file order.
    """
    path = os.path.join(dir, "global_usage")
    found = []
    name = desc = ""
    with open(path) as fileobject:
        for line in fileobject:
            if "#" in line:
                continue
            if "NAME:" in line:
                name = line.replace("NAME:", "").strip()
            elif "DESC EN" in line:
                # The marker is matched without its colon (unlike "NAME:"),
                # so drop a leading ':' left over in front of the text.
                desc = line.replace("DESC EN", "").strip().lstrip(":").strip()
            if name != "" and desc != "":
                found.append({'name': name, 'desc': desc})
                name = desc = ""
    for entry in found:
        # Printing an explicit tuple gives the same output on Python 2 and 3.
        print((entry['name'], entry['desc']))
    print("")
    return found
def gen_cats_sql_tables():
    """Return the CREATE TABLE statement for the categories table.

    The table name comes from the module-level ``categories_table``; the
    returned text starts and ends with a newline.
    """
    ddl_lines = [
        "",
        "CREATE TABLE `{cat}` (",
        "`id` INT(10) NOT NULL AUTO_INCREMENT,",
        "`name` VARCHAR(500) NOT NULL,",
        "`desc` TEXT NOT NULL,",
        "PRIMARY KEY (`id`)",
        ")",
        "COLLATE='utf8_general_ci'",
        "ENGINE=InnoDB;",
        "",
    ]
    return "\n".join(ddl_lines).format(cat=categories_table)
def gen_items_sql_tables():
    """Return the CREATE TABLE statement for the elements (domains) table.

    The table name comes from the module-level ``elements_table``; the
    returned text starts and ends with a newline.
    """
    ddl_lines = [
        "",
        "CREATE TABLE `{elements}` (",
        "`id` INT(10) NOT NULL AUTO_INCREMENT,",
        "`cat_name` VARCHAR(500) NOT NULL,",
        "`url` TEXT NOT NULL,",
        "PRIMARY KEY (`id`)",
        ")",
        "COLLATE='utf8_general_ci'",
        "ENGINE=InnoDB;",
        "",
    ]
    return "\n".join(ddl_lines).format(elements=elements_table)
# Emit both table definitions first, then the INSERT statements for the
# elements table. Single-argument print(...) runs on Python 2 and Python 3.
print(gen_cats_sql_tables())
print(gen_items_sql_tables())
gen_items()
Файлът с лошите адреси може да се свали от:
http://www.shallalist.de/Downloads/shallalist.tar.gz
P.S. Би трябвало да работи и с:
http://urlblacklist.com/