首页 > 技术文章 > Apriori 获取关联规则实现

Merodach 2018-05-15 13:40 原文

前言

自己的一个Apriori 获取关联规则的python实现。具体原理不讲,代码添加了说明,还是很好理解的。

数据预处理

#最小置信度
min_conf = 0.5
#最小支持度
min_sup = 2
f=open('data.txt')
dataset = f.readlines()
print(dataset)
['T1\tbread, cream, milk, tea\n', 'T2\tbread, cream, milk\n', 'T3\tcake, milk\n', 'T4\tmilk, tea\n', 'T5\tbread, cake, milk\n', 'T6\tbread, tea\n', 'T7\tbeer, milk, tea\n', 'T8\tbread, tea\n', 'T9\tbread, cream, milk, tea\n', 'T10\tbread, milk, tea']
dataset = [data.replace('\n','').split('\t') for data in dataset]
print(dataset)
[['T1', 'bread, cream, milk, tea'], ['T2', 'bread, cream, milk'], ['T3', 'cake, milk'], ['T4', 'milk, tea'], ['T5', 'bread, cake, milk'], ['T6', 'bread, tea'], ['T7', 'beer, milk, tea'], ['T8', 'bread, tea'], ['T9', 'bread, cream, milk, tea'], ['T10', 'bread, milk, tea']]
dataset = [tuple([data[0],sorted(data[1].replace(' ', '').split(','))]) for data in dataset]
print(dataset)
[('T1', ['bread', 'cream', 'milk', 'tea']), ('T2', ['bread', 'cream', 'milk']), ('T3', ['cake', 'milk']), ('T4', ['milk', 'tea']), ('T5', ['bread', 'cake', 'milk']), ('T6', ['bread', 'tea']), ('T7', ['beer', 'milk', 'tea']), ('T8', ['bread', 'tea']), ('T9', ['bread', 'cream', 'milk', 'tea']), ('T10', ['bread', 'milk', 'tea'])]
terms = [term for data in dataset for term in data[1]]
terms.sort()
terms = [terms[i] for i in range(0,len(terms)) if i==0 or terms[i]!=terms[i-1]]
print(terms)
['beer', 'bread', 'cake', 'cream', 'milk', 'tea']

Aprior寻找频繁项集

def is_sub_seq(P, T):
    '''判断P是否为T的子序列
    Parameters
    --------
    P: 一个有序序列
    T: 一个有序序列
    '''
    i, j = 0, 0
    while(i<len(P) and j<len(T)):
        if(P[i]==T[j]):
            i+=1
        j+=1
    return i==len(P)
def Aprior_sieve(L):
    '''从一个项集组成的序列中中筛选出频繁项集
    Parameters
    ---
    L: 一个项集组成的序列
    
    Returns
    ---
    一个频繁项集和它支持度组成的序列
    '''
    L = [[l,0] for l in L]
    for l in L:
        for data in dataset:
            if(is_sub_seq(l[0], data[1])):
                l[1] += 1
    L = [l for l in L if l[1]>=minsup]
    return L
    
def Aprior_gen(L,k):
    '''通过k项集构造k+1项集
    Parameters
    ---
    L:一个频繁k项集和它支持度组成的序列
    k:频繁k项集的项数
    
    Returns
    ---
    一个k+1项集组成的序列
    '''
    print(k,":\t",L)
    NL = []
    myset = {tuple(l[0]) for l in L}
    for i in range(0, len(L)):
        for j in range(i+1, len(L)):
            if(L[i][0][:k-1]==L[j][0][:k-1]):
                nl = L[i][0].copy()
                nl.append(L[j][0][k-1])
                ok = True
                for r in range(0, k-1):
                    tmp = nl.copy()
                    tmp.pop(r)
                    tmp = tuple(tmp)
                    if(tmp not in myset):
                        ok = False
                        break
                if(ok):
                    NL.append(nl)
            else:
                break
    return NL
                
L = [[term] for term in terms]
L = Aprior_sieve(L)
print(L)
[[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]]
Ans = []
Ans.append(L)
for i in range(1,len(terms)):
    L = Aprior_gen(Ans[i-1],i)
    L = Aprior_sieve(L)
    if(len(L)==0):
        break
    Ans.append(L)
print(Ans)
1 :	 [[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]]
2 :	 [[['bread', 'cream'], 3], [['bread', 'milk'], 5], [['bread', 'tea'], 5], [['cake', 'milk'], 2], [['cream', 'milk'], 3], [['cream', 'tea'], 2], [['milk', 'tea'], 5]]
3 :	 [[['bread', 'cream', 'milk'], 3], [['bread', 'cream', 'tea'], 2], [['bread', 'milk', 'tea'], 3], [['cream', 'milk', 'tea'], 2]]
4 :	 [[['bread', 'cream', 'milk', 'tea'], 2]]
[[[['bread'], 7], [['cake'], 2], [['cream'], 3], [['milk'], 8], [['tea'], 7]], [[['bread', 'cream'], 3], [['bread', 'milk'], 5], [['bread', 'tea'], 5], [['cake', 'milk'], 2], [['cream', 'milk'], 3], [['cream', 'tea'], 2], [['milk', 'tea'], 5]], [[['bread', 'cream', 'milk'], 3], [['bread', 'cream', 'tea'], 2], [['bread', 'milk', 'tea'], 3], [['cream', 'milk', 'tea'], 2]], [[['bread', 'cream', 'milk', 'tea'], 2]]]

获取关联规则

mydict = { tuple(l[0]):l[1] for i in range(0, len(Ans)) for l in Ans[i]}
print(mydict)
R=set()
{('bread',): 7, ('cake',): 2, ('cream',): 3, ('milk',): 8, ('tea',): 7, ('bread', 'cream'): 3, ('bread', 'milk'): 5, ('bread', 'tea'): 5, ('cake', 'milk'): 2, ('cream', 'milk'): 3, ('cream', 'tea'): 2, ('milk', 'tea'): 5, ('bread', 'cream', 'milk'): 3, ('bread', 'cream', 'tea'): 2, ('bread', 'milk', 'tea'): 3, ('cream', 'milk', 'tea'): 2, ('bread', 'cream', 'milk', 'tea'): 2}
def conf(rule):
    return mydict[rule[1]]/mydict[rule[0]]
def gen_rule(X, Y):
    for item in Y:
        if item not in X:
            nX = X.copy()
            nX.append(item)
            nX.sort()
            rule = (tuple(nX),Y)
            if(rule not in R and conf(rule)>=min_conf):
                R.add(rule)
                gen_rule(nX, Y)
for l in mydict.keys():
    gen_rule([],l)
R = [(f, tuple(set(b)-set(f))) for f, b in R]
R.sort()
print(R)
[(('bread',), ()), (('bread',), ('milk',)), (('bread',), ('tea',)), (('bread', 'cream'), ()), (('bread', 'cream'), ('milk',)), (('bread', 'cream'), ('milk', 'tea')), (('bread', 'cream'), ('tea',)), (('bread', 'cream', 'milk'), ()), (('bread', 'cream', 'milk'), ('tea',)), (('bread', 'cream', 'milk', 'tea'), ()), (('bread', 'cream', 'tea'), ()), (('bread', 'cream', 'tea'), ('milk',)), (('bread', 'milk'), ()), (('bread', 'tea'), ()), (('cake',), ()), (('cake',), ('milk',)), (('cake', 'milk'), ()), (('cream',), ()), (('cream',), ('bread',)), (('cream',), ('bread', 'milk')), (('cream',), ('bread', 'milk', 'tea')), (('cream',), ('bread', 'tea')), (('cream',), ('milk',)), (('cream',), ('milk', 'tea')), (('cream',), ('tea',)), (('cream', 'milk'), ()), (('cream', 'milk'), ('bread',)), (('cream', 'milk'), ('bread', 'tea')), (('cream', 'milk'), ('tea',)), (('cream', 'milk', 'tea'), ()), (('cream', 'milk', 'tea'), ('bread',)), (('cream', 'tea'), ()), (('cream', 'tea'), ('bread',)), (('cream', 'tea'), ('bread', 'milk')), (('cream', 'tea'), ('milk',)), (('milk',), ()), (('milk',), ('bread',)), (('milk',), ('tea',)), (('milk', 'tea'), ()), (('tea',), ()), (('tea',), ('bread',)), (('tea',), ('milk',))]
print(len(mydict), len(R))
17 42

感想

算法的原理还是比较简单的,但实现起来还是要花些功夫。另外使用python的一些特性可以极大简化代码实现,如列表解析(学到了一种多重循环的解析),容器转换。踩了一波语法特性的坑,比如copy,dict的键为容器的话只能用tuple

推荐阅读