Парсване на www.shallalist.de списъка с "забранени" сайтове
Published on 03.03.2013
За един проект ми трябваха различните "лоши" адреси от www.shallalist.de под формата на MySQL таблици. Това е набързо написан скрипт, който парсва списъка и генерира SQL, готов за импортиране:
# Generate MySQL import statements from an unpacked shallalist blacklist.
#
# The archive with the blacklisted addresses can be downloaded from
# http://www.shallalist.de/Downloads/shallalist.tar.gz
#
# Usage: python script.py [blacklist_dir] > blacklist.sql
import sys
import os

# Default location of the unpacked blacklist. The script expects one
# sub-directory per category, each holding a "domains" file, plus a
# "global_usage" file describing the categories.
# NOTE: the name shadows the builtin ``dir``; it is kept unchanged so existing
# users of this module-level setting keep working.
dir = "C:\\Users\\Yuks\\Desktop\\BL\\"
categories_table = "bl_categories"
elements_table = "bl_elements"


def ls(dir, hidden=False, relative=True):
    """Return the sorted entries of directory *dir*.

    dir      -- directory to list.
    hidden   -- when false, entries whose name starts with '.' are skipped.
    relative -- when false, each entry is joined with *dir*.

    Raises OSError when *dir* cannot be listed.
    """
    nodes = [nm for nm in os.listdir(dir) if hidden or not nm.startswith('.')]
    if not relative:
        nodes = [os.path.join(dir, nm) for nm in nodes]
    nodes.sort()
    return nodes


def _sql_quote(value):
    """Escape *value* for use inside a single-quoted MySQL string literal."""
    # Backslashes first, so the escaping itself is not escaped again.
    return value.replace("\\", "\\\\").replace("'", "''")


def gen_items(base_dir=None):
    """Print one INSERT statement per domain of every category.

    base_dir -- blacklist directory; defaults to the module-level ``dir``.

    Every entry of *base_dir* is treated as a category whose domains are
    listed one per line in ``<entry>/domains``. Entries without a readable
    ``domains`` file (plain files, for instance) are skipped.
    """
    if base_dir is None:
        base_dir = dir
    statement = ("INSERT INTO `{el_cat}` (`cat_name`, `url`) "
                 "VALUES ('{cat_name}', '{domain}');")
    for el in ls(base_dir):
        path = os.path.join(base_dir, el, "domains")
        try:
            fileobject = open(path)
        except (IOError, OSError):
            # Best effort: not a category directory, or not readable.
            continue
        cat_name = _sql_quote(el.strip())
        with fileobject:
            for line in fileobject:
                domain = line.strip()
                if not domain:
                    # Lines keep their newline, so emptiness has to be
                    # checked after stripping.
                    continue
                print(statement.format(el_cat=elements_table,
                                       cat_name=cat_name,
                                       domain=_sql_quote(domain)))


def _read_categories(path):
    """Parse *path* into a list of ``{'name': ..., 'desc': ...}`` dicts.

    Lines containing '#' are ignored. A category is recorded as soon as both
    a ``NAME:`` line and a ``DESC EN`` line have been seen.
    """
    entries = []
    name = desc = ""
    with open(path) as fileobject:
        for line in fileobject:
            if "#" in line:
                continue
            if "NAME:" in line:
                name = line.replace("NAME:", "").strip()
            elif "DESC EN" in line:
                # The key is matched without a colon, so drop a leftover
                # separator if the line is written as "DESC EN: text".
                desc = line.replace("DESC EN", "").strip().lstrip(":").strip()
            if name and desc:
                entries.append({'name': name, 'desc': desc})
                name = desc = ""
    return entries


def categories(base_dir=None):
    """Print the (name, description) pair of every category.

    base_dir -- blacklist directory; defaults to the module-level ``dir``.

    The pairs are read from the ``global_usage`` file inside *base_dir*.
    Raises IOError/OSError when that file cannot be opened.
    """
    if base_dir is None:
        base_dir = dir
    for el in _read_categories(os.path.join(base_dir, "global_usage")):
        print((el['name'], el['desc']))
        print("")


def gen_cats_sql_tables():
    """Return the CREATE TABLE statement for the categories table."""
    r = """
CREATE TABLE `{cat}` (
    `id` INT(10) NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(500) NOT NULL,
    `desc` TEXT NOT NULL,
    PRIMARY KEY (`id`)
)
COLLATE='utf8_general_ci'
ENGINE=InnoDB;
"""
    return r.format(cat=categories_table)


def gen_items_sql_tables():
    """Return the CREATE TABLE statement for the elements (domains) table."""
    r = """
CREATE TABLE `{elements}` (
    `id` INT(10) NOT NULL AUTO_INCREMENT,
    `cat_name` VARCHAR(500) NOT NULL,
    `url` TEXT NOT NULL,
    PRIMARY KEY (`id`)
)
COLLATE='utf8_general_ci'
ENGINE=InnoDB;
"""
    return r.format(elements=elements_table)


def main(argv=None):
    """Print the table definitions followed by all INSERT statements.

    argv -- command-line arguments without the program name; the first one,
            when present, overrides the default blacklist directory.
    """
    args = sys.argv[1:] if argv is None else argv
    base_dir = args[0] if args else dir
    print(gen_cats_sql_tables())
    print(gen_items_sql_tables())
    gen_items(base_dir)


if __name__ == "__main__":
    main()
P.S. Би трябвало да работи и с http://urlblacklist.com/