-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathNNClassifier.py
More file actions
123 lines (112 loc) · 5.45 KB
/
NNClassifier.py
File metadata and controls
123 lines (112 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python
#-*-coding:utf-8-*-
'''@author:duncan'''
import DataPrepare as datapre
import Metric as metric
import GreedyAlgorithm as greedy
import SAalgo as sa
import KMedoids_Clustering as kmediods
import Initial as init
import TwitterWithNeo4j as neo4j
import PageRank as pr
import pickle
from numpy import *
class Classifier:
def __init__(self,features):
self.features = features
# 先将数据集进行3:7划分,形成训练集和测试集
def Split(self):
# 返回结果为训练集和测试集
train_set = {}
test_set = {}
# 对原集中每个领域取3/10加入train_set,取7/10加入test_set
people = datapre.People(self.features)
categories = datapre.GetUserCategory()
for category in categories:
domain_people = people[category]
train_set_number = int(len(domain_people) * 0.3) + 1
count = 0
for id in domain_people:
if count < train_set_number:
train_set[id] = self.features[id]
count += 1
else:
break
# 将剩余的用户加入
left = set(self.features.keys()) - set(train_set.keys())
for id in left:
test_set[id] = self.features[id]
return train_set,test_set
# 用每个方法提取出来的代表性人物来做NN分类,并计算准确率
def Classify(self,profiles,test_set):
print "连接neo4j数据库"
driver,session = neo4j.Conn()
# 获取profiles的followers,并以字典存储
followers = {}
for profile in profiles:
followers[profile] = set(neo4j.GetFollowers(driver,session,profile))
# 对test_set遍历,对每个其中的元素,用与profiles中的对其代表性最大的元素来标记其领域
results = {}
for element in test_set.keys():
# followings = set(neo4j.GetFollowings(driver,session,element))
results[element] = self.features[max({profile:1.5 * metric.CRepre(self.features[profile],self.features[element]) if element in followers[profile] else metric.CRepre(self.features[profile],self.features[element]) for profile in profiles}.items(),key=lambda dic:dic[1])[0]][5]
# 计算准确性
count = 0
for result in results.keys():
if self.features[result][5] == results[result]:
count += 1
print "Accuracy is %.3f" % (count * 1.0 / len(results))
driver.close()
session.close()
return (count * 1.0 / len(results))
def test():
method = Classifier(datapre.Features())
train_set,test_set = method.Split()
print "数据集分割完成"
print "训练集和测试集数量为:%d,%d" % (len(train_set),len(test_set))
# 三个方法分别在train_set中寻找100个代表性人物,用代表性人物来分类test_set
epsilons = [0.1560,0.1556,0.1555]
# 将PageRank提取出来的100个用户也来做个分类
# PageRank_method = pr.PageRank(40,train_set,datapre.GetUserCategory())
# # 获得出入度矩阵
# uMatrix = PageRank_method.GetUserMatrix()
# #
# # 转移矩阵
# fMatrix = mat([(1 - 0.85) / len(train_set.keys()) for i in range(len(train_set.keys()))]).T
# # 初始矩阵
# initPRMatrix = mat([1 for i in range(len(train_set.keys()))]).T
# # result为影响力分数结果
# PRMatrix = PageRank_method.PageRank(uMatrix,fMatrix,0.85,initPRMatrix,0.01,120)
# user_ids = train_set.keys()
# uPR = {}
# for i,id in zip(range(len(user_ids)),user_ids):
# uPR[id] = PRMatrix[i]
# # 对uPR排序
# uPR = sorted(uPR.items(),key = lambda dic:dic[1],reverse=True)
# profiles = [u[0] for u in uPR[:100]]
# print "PageRank的分类准确性为%.3f" % method.Classify(profiles,test_set)
# return
# epsilons = [0.1556,0.1555]
# epsilons = [0.1560]
# init.InitialMatrix(train_set)
number = [40,60,80,100]
print "开始抽取代表性用户"
for epsilon in epsilons:
with open("%.4f" % epsilon,"wb") as f:
for k in number:
profiles1 = greedy.Greedy(k,train_set,datapre.CategoriesDistribution(),epsilon).SearchWithReplace()
print "GB方法计算完成"
profiles2 = kmediods.KMedoids(k,train_set,datapre.CategoriesDistribution(),epsilon).Search()
print "kmedoids方法计算完成"
profiles3 = sa.SAalgo(k,train_set,datapre.CategoriesDistribution(),epsilon,0.3,10,0.9).Search()
print "sa方法计算完成"
accuracy1 = method.Classify(profiles1,test_set)
f.write("方法:GB; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy1))
accuracy2 = method.Classify(profiles2,test_set)
f.write("方法:kmedoids; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy2))
accuracy3 = method.Classify(profiles3,test_set)
f.write("方法:SA; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy3))
print "方法:GB; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy1)
print "方法:kmedoids; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy2)
print "方法:SA; 典型阈值:%f; 代表性子集数量:%d; 准确率:%.3f \n" % (epsilon,k,accuracy3)
test()