Heads up! This post was written 13 years ago. Some information might be outdated or may have changed since then.
За един проект ми трябваха различните "лоши" адреси от www.shallalist.de под формата на MySQL таблици. Това е набързо написан скрипт, който ги парсва и генерира SQL, готов за импортиране:
import sys
import os

# Folder that holds the blacklist data (note the trailing path separator).
dir = "C:\\Users\\Yuks\\Desktop\\BL\\"

# Names of the MySQL tables used in the generated SQL.
categories_table, elements_table = "bl_categories", "bl_elements"

def ls(dir, hidden=False, relative=True):
    """Return the sorted entries of the directory ``dir``.

    Entries whose name starts with a dot are left out unless ``hidden`` is
    true.  With ``relative`` true (the default) the bare entry names are
    returned; otherwise every name is joined onto ``dir``.
    """
    entries = (
        name for name in os.listdir(dir)
        if hidden or not name.startswith('.')
    )
    if relative:
        return sorted(entries)
    return sorted(os.path.join(dir, name) for name in entries)

def gen_items():
    """Print one INSERT statement per domain of every category.

    Each entry of ``dir`` (as listed by ``ls``) is treated as a category
    whose domains are read from ``<dir>/<entry>/domains``, one per line.
    Entries without a readable ``domains`` file are skipped.  The statements
    target ``elements_table`` and are written to standard output.
    """
    def quote(value):
        # Escape the characters that could end or alter a single-quoted
        # MySQL string literal (assuming the default SQL mode, in which the
        # backslash is an escape character).
        return value.replace("\\", "\\\\").replace("'", "''")

    statement = "INSERT INTO `{el_cat}` (`cat_name`, `url`) VALUES ('{cat_name}', '{domain}');"
    for el in ls(dir):
        path = os.path.join(dir, el, "domains")
        try:
            fileobject = open(path)
        except (IOError, OSError):
            # Best effort: not every entry has a domains file that can be
            # opened.  Only the open() failure is ignored; other errors
            # propagate.
            continue
        category = quote(el.strip())
        with fileobject:
            for line in fileobject:
                domain = line.strip()
                if not domain:
                    # Lines keep their newline when read, so emptiness has
                    # to be tested after stripping.
                    continue
                # Single-argument print(...) works on Python 2 and 3 alike.
                print(statement.format(el_cat=elements_table,
                                       cat_name=category,
                                       domain=quote(domain)))

def categories():
    """Print the (name, description) pairs found in ``global_usage``.

    The file ``global_usage`` inside ``dir`` is scanned line by line.  Lines
    containing ``#`` are ignored.  A line containing ``NAME:`` sets the
    current name and a line containing ``DESC EN`` sets the current
    description; as soon as both are non-empty the pair is recorded and both
    are reset.  Every recorded pair is printed as a tuple, followed by one
    blank line.
    """
    path = os.path.join(dir, "global_usage")
    found = []
    name = desc = ""
    with open(path) as fileobject:
        for line in fileobject:
            if "#" in line:
                continue
            if "NAME:" in line:
                name = line.replace("NAME:", "").strip()
            elif "DESC EN" in line:
                desc = line.replace("DESC EN", "").strip()
                # The marker is matched without its colon, so drop a colon
                # left at the front of the description.
                # NOTE(review): assumes the file writes "DESC EN:" the way it
                # writes "NAME:" - confirm against a real global_usage file.
                desc = desc.lstrip(":").strip()

            if name and desc:
                found.append({'name': name, 'desc': desc})
                name = desc = ""

    for el in found:
        # Double parentheses: print the tuple itself, identically on
        # Python 2 and Python 3.
        print((el['name'], el['desc']))
    # A bare ``print`` emits a blank line only on Python 2; this form does so
    # on both versions.
    print("")

def gen_cats_sql_tables():
    """Return the CREATE TABLE statement for the categories table.

    The table name is taken from the module-level ``categories_table``.
    """
    ddl_lines = [
        " CREATE TABLE `{cat}` (",
        "`id` INT(10) NOT NULL AUTO_INCREMENT,",
        "`name` VARCHAR(500) NOT NULL,",
        "`desc` TEXT NOT NULL,",
        "PRIMARY KEY (`id`)",
        ") COLLATE='utf8_general_ci' ENGINE=InnoDB;",
        "",  # produces the newline that ends the statement
    ]
    return "\n".join(ddl_lines).format(cat=categories_table)

def gen_items_sql_tables():
    """Return the CREATE TABLE statement for the elements table.

    The table name is taken from the module-level ``elements_table``.
    """
    ddl_lines = [
        " CREATE TABLE `{elements}` (",
        "`id` INT(10) NOT NULL AUTO_INCREMENT,",
        "`cat_name` VARCHAR(500) NOT NULL,",
        "`url` TEXT NOT NULL,",
        "PRIMARY KEY (`id`)",
        ") COLLATE='utf8_general_ci' ENGINE=InnoDB;",
        "",  # produces the newline that ends the statement
    ]
    return "\n".join(ddl_lines).format(elements=elements_table)

# Emit both table definitions first, then the INSERT rows.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print(gen_cats_sql_tables())
print(gen_items_sql_tables())
gen_items()
Файлът с лошите адреси може да се свали от http://www.shallalist.de/Downloads/shallalist.tar.gz
P.S. Би трябвало да работи и с http://urlblacklist.com/

Back to all posts