Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted as Plain Text by registered user tanmay ( 8 years ago )
CRAWLING
CRAWLING:
CODE-
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import json
class LinkParser(HTMLParser):
    """HTML parser that fetches a page, archives it and collects its links.

    ``getLinks`` downloads a URL, stores the raw bytes in
    ``<numberVisited>.txt``, appends the URL to the ``FileCheckDatabase``
    file and returns every ``<a href=...>`` target found on the page as an
    absolute URL.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialised here so the parser can be fed before getLinks() runs.
        self.links = []
        self.baseUrl = ''

    def handle_starttag(self, tag, attrs):
        """Record the absolute URL of each ``href`` attribute on an ``<a>`` tag."""
        if tag != 'a':
            return
        for key, value in attrs:
            if key == 'href':
                # Resolve relative links against the page being parsed.
                self.links.append(parse.urljoin(self.baseUrl, value))

    def getLinks(self, url, numberVisited):
        """Fetch ``url``, archive it and return the links it contains.

        Args:
            url: Address of the page to download.
            numberVisited: Counter used to name the archive file
                (``<numberVisited>.txt``).

        Returns:
            The list of absolute link URLs found on the page, or ``[]`` when
            the response is not ``text/html``.
        """
        self.links = []
        self.baseUrl = url
        # Context manager closes the connection even if reading fails.
        with urlopen(url) as response:
            # The header may be absent; treat that as "not HTML" instead of
            # failing with a TypeError on ``in None``.
            content_type = response.getheader('Content-Type') or ''
            htmlBytes = response.read()

        if 'text/html' not in content_type:
            return []

        # Pages are not guaranteed to be valid UTF-8; replace undecodable
        # bytes rather than aborting the crawl.
        htmlString = htmlBytes.decode("utf-8", errors="replace")

        file_name = str(numberVisited) + '.txt'
        with open(file_name, 'wb') as page_file:
            page_file.write(htmlBytes)
        with open("FileCheckDatabase", 'a') as url_log:
            url_log.write(url + '\n')
        print("File Writing Successful of" + str(numberVisited) + "!")

        self.feed(htmlString)
        return self.links
def spider(url, maxPages):
pagesToVisit = [url]
print (pagesToVisit)
numberVisited = 0
while numberVisited < maxPages xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed query = "WikiPedia is a very good site for learning, and is very helpful for children" xss=removed xss=removed xss=removed xss=removed xss=removed encoding="UTF-8" xss=removed xss=removed xss=removed xss=removed xss=removed>", cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))
# Score every document row of the TF-IDF matrix against row 0.
# NOTE(review): row 0 is presumably the query and rows 1.. the crawled
# files; the code that builds train_set / tfidf_matrix_train / file_names / d
# is not readable in this paste - confirm.
cosine_similar_value = []
for i in range(1, len(train_set)):
    c = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[i])
    # Take the single score straight out of the 1x1 result. Slicing
    # str(c) depended on numpy's print format and truncated the value.
    cosine_similar_value.append(float(c[0][0]))

# Map file index -> score, then order the scores from best to worst.
for j in range(len(file_names)):
    d[j] = cosine_similar_value[j]
cosine_similar_value.sort(reverse=True)

# FileCheckDatabase holds whitespace-separated URLs.
with open('FileCheckDatabase', 'r') as file:
    l = file.read().split()

# Print the URLs of the best matches (at most ten, fewer for small corpora).
for i in range(min(10, len(d))):
    # First remaining file index that carries the i-th best score; deleting
    # it afterwards lets tied scores resolve to distinct files.
    x = list(d.keys())[list(d.values()).index(cosine_similar_value[i])]
    del d[x]
    # File names look like "<n>.txt"; the line looked up is l[n - 1].
    h = int(file_names[x][:-4])
    print(l[h - 1])
CENTRALITY
import networkx as nx
import matplotlib.pyplot as plt
# Load an undirected graph with integer node ids from the edge list file.
G = nx.read_edgelist('node.txt', create_using=nx.Graph(), nodetype=int)
print(nx.info(G))
nx.draw(G, with_labels=True)

# Each centrality is computed once and reused. The "max" node is chosen by
# centrality value: max() over the dict alone would return the largest node
# id, not the most central node.
degree_centrality = nx.degree_centrality(G)
print("\nDegree Centrality: ")
print(degree_centrality)
print("\n The node with max degree centrality is")
print(max(degree_centrality, key=degree_centrality.get))

betweenness_centrality = nx.betweenness_centrality(G)
print("\nBetweenness Centrality: ")
print(betweenness_centrality)
print("\n The node with max betweenness centrality is")
print(max(betweenness_centrality, key=betweenness_centrality.get))

closeness_centrality = nx.closeness_centrality(G)
print("\nCloseness Centrality: ")
print(closeness_centrality)
print("\n The node with max closeness centrality is")
print(max(closeness_centrality, key=closeness_centrality.get))

plt.show()
PRESTIGE,CO-CITATION
import networkx as nx
import matplotlib.pyplot as plt
# Small directed example graph used for every measure below.
g = nx.DiGraph()
g.add_edges_from([(1, 2), (1, 3), (3, 1), (3, 2), (2, 4), (3, 4), (4, 3)])
print (nx.info(g))

# Degree prestige: in-degree normalised by the number of other nodes.
print ("Degree Prestige:")
node_count = g.number_of_nodes()
degree_prestige = {v: g.in_degree(v) / (node_count - 1) for v in g.nodes}
print (degree_prestige)
print ("*******")

# Proximity measure: mean shortest-path distance to node i from every other
# node that can reach it.
length = nx.all_pairs_shortest_path_length(g)
d = dict(length)
final_dict = {}
for i in g.nodes:
    # Iterate the real node ids and skip sources that cannot reach i, rather
    # than stopping at the first unreachable one.
    distances = [d[j][i] for j in g.nodes if j != i and i in d[j]]
    # No other node reaches i: the mean is undefined, so store None instead
    # of dividing by zero.
    final_dict[i] = sum(distances) / len(distances) if distances else None
print ('Proximity Prestige')
print (final_dict)
print ('*************')

# Adjacency matrix as nested lists: list1[a][b] is the weight of edge a -> b.
matrix = nx.to_numpy_matrix(g)
list1 = matrix.tolist()
size = len(list1)

# Co-citation: entry [i][j] sums, over all k, the product of k -> i and k -> j
# (how many nodes link to both i and j for a 0/1 matrix).
final_list = [
    [sum(list1[k][j] * list1[k][i] for k in range(size)) for j in range(size)]
    for i in range(size)
]
print ("Co-Citation:")
print (final_list)
print ('********')

# Bibliographic coupling: entry [i][j] sums, over all k, the product of
# i -> k and j -> k (how many nodes both i and j link to).
final_coupling = [
    [sum(list1[i][k] * list1[j][k] for k in range(size)) for j in range(size)]
    for i in range(size)
]
print ("Bibliographic Coupling:")
print (final_coupling)
print ("******")

print ("Page Rank:")
print (nx.pagerank(g))
# with_labels is a boolean flag; the string 'true' only worked by being truthy.
nx.draw(g, with_labels=True)
plt.show()
PAGE RANK USING INLINK OUTLINK
import matplotlib.pyplot as plt
from networkx import nx
# Directed example graph with four edges over nodes 1-4.
G=nx.DiGraph()
G.add_edges_from([(1,2),(1,4),(2,3),(3,4)])
def pagerank(G, alpha=0.85, personalization=None,max_iter=100, tol=1.0e-6, nstart=None, weight='weight',dangling=None):
if len(G) == 0:
return {}
if not G.is_directed():
D = G.to_directed()
else:
D = G
W = nx.stochastic_graph(D, weight=weight)
N = W.number_of_nodes()
if nstart is None:
x = dict.fromkeys(W, 1.0 / N)
else:
s = float(sum(nstart.values()))
x = dict((k, v / s) for k, v in nstart.items())
p = dict.fromkeys(W, 1.0 / N)
dangling_weights = p
dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
for _ in range(max_iter):
xlast = x
x = dict.fromkeys(xlast.keys(), 0)
danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
for n in x:
for nbr in W[n]:
x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n]
err = sum([abs(x[n] - xlast[n]) for n in x])
if err < N h,a=nx.hits(G) query = "data mining web mining" xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed xss=removed cosine_similar_value.sort(reverse=True) k=0; xss=removed k=k+1; xss=removed xss=removed hits(G,max_iter=100,tol e-8,nstart=None,normalized xss=removed xss=removed xss=removed h=dict.fromkeys(G,1.0/G.number_of_nodes()) h=nstart s=1.0/sum(h.values()) h[k]*=s i=0 hlast=h h=dict.fromkeys(hlast.keys(),0) a=dict.fromkeys(hlast.keys(),0) a[nbr]+=hlast[n]*G[n][nbr].get( h[n]+=a[nbr]*G[n][nbr].get( s=1.0/max(h.values()) h[n]*=s s=1.0/max(a.values()) a[n]*=s err=sum([abs(h[n]-hlast[n])>max_iter:
raise NetworkXError(\
"HITS: power iteration failed to converge in %d iterations."%(i+1))
i+=1
if normalized:
s = 1.0/sum(a.values())
for n in a:
a[n] *= s
s = 1.0/sum(h.values())
for n in h:
h[n] *= s
print("\nHub Score:\n")
print(h)
print("\nAuthority Score:\n")
print(a)
def authority_matrix(G,nodelist=None):
    """Return the authority matrix ``M.T * M`` of graph ``G``.

    Args:
        G: A networkx graph.
        nodelist: Optional node ordering for the rows/columns, passed
            through to ``nx.to_numpy_matrix``.

    Returns:
        The product of the transposed adjacency matrix with the adjacency
        matrix.
    """
    M = nx.to_numpy_matrix(G, nodelist=nodelist)
    # NOTE(review): relies on M being a numpy matrix so that ``*`` is a
    # matrix product rather than an element-wise one - confirm for the
    # installed networkx version.
    return M.T * M
def hub_matrix(G,nodelist=None):
    """Return the hub matrix ``M * M.T`` of graph ``G``.

    Args:
        G: A networkx graph.
        nodelist: Optional node ordering for the rows/columns, passed
            through to ``nx.to_numpy_matrix``.

    Returns:
        The product of the adjacency matrix with its transpose.
    """
    M = nx.to_numpy_matrix(G, nodelist=nodelist)
    # NOTE(review): relies on M being a numpy matrix so that ``*`` is a
    # matrix product rather than an element-wise one - confirm for the
    # installed networkx version.
    return M * M.T
# Summarise the graph, show both HITS matrices, then run the iterative
# hub/authority computation and display the graph.
print(nx.info(G))
for heading, build_matrix in (
    ("\n\n\nHub Matrix:\n", hub_matrix),
    ("\nAuthority Matrix:\n", authority_matrix),
):
    print(heading)
    print(build_matrix(G))
hits(G)
nx.draw(G)
plt.show()
K-MEANS
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

data = pd.read_excel('kmeans.xlsx')  # Include your data file instead of data.xlsx
idea_column = data.iloc[:, 0:1]  # Selecting the first column that has text.
# Collect the raw documents; the first column is expected to be named 'Idea'.
corpus = [row['Idea'] for _, row in idea_column.iterrows()]

# Bag-of-words counts, then TF-IDF weighting.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Idea'].values.astype(str))
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)
print(tfidf.shape )

num_clusters = 5  # Change it according to your data.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf)
clusters = km.labels_.tolist()

# Creating dict having doc with the corresponding cluster number.
idea = {'Content': corpus, 'Cluster': clusters}
# Converting it into a dataframe.
frame = pd.DataFrame(idea, index=[clusters], columns=['Content', 'Cluster'])
print("\n")
print(frame)  # Print the doc with the labeled cluster number.
print("\n")
print(frame['Cluster'].value_counts())
NAÏVE BAYES
def extract_vocab(path='vocab.txt'):
    """Load the vocabulary file and map each word to its line index.

    Args:
        path: Vocabulary file with one word per line. Defaults to
            ``'vocab.txt'`` in the current directory.

    Returns:
        A dict mapping each word to its zero-based line number.
    """
    with open(path) as f:
        words = f.read().splitlines()
    return {word: index for index, word in enumerate(words)}
def read_samples(path):
    """Read labelled documents from a tab-separated file.

    Each non-blank line has the form ``<document id>\\t<class>\\t<words>``,
    where the words are separated by single spaces.

    Args:
        path: File to read.

    Returns:
        A list of ``(document_id, real_class, words)`` tuples, where
        ``real_class`` is an int and ``words`` is a list of strings.

    Raises:
        IndexError: If a non-blank line has no class field.
        ValueError: If the class field is not an integer.
    """
    samples = []
    with open(path) as f:
        for sample in f.read().splitlines():
            sample = sample.strip()
            if not sample:
                # Blank lines (e.g. a trailing newline) carry no document.
                continue
            sample_elements = sample.split('\t')
            documents_id = sample_elements[0]
            real_class = int(sample_elements[1])
            # Stripping the line removes an empty trailing words field, so a
            # document without words has only two elements.
            if len(sample_elements) > 2:
                words = sample_elements[2].split(' ')
            else:
                words = []
            samples.append((documents_id, real_class, words))
    return samples
def train_model(word_2_num, train_path='Train.txt'):
    """Estimate Naive Bayes parameters from the training file.

    Args:
        word_2_num: Mapping from vocabulary word to its numeric id.
        train_path: Training file in the format read by ``read_samples``.
            Defaults to ``'Train.txt'``.

    Returns:
        A ``(prior_probabilities, word_likelihoods_per_class)`` tuple.
        ``prior_probabilities`` maps class -> fraction of training documents
        in that class. ``word_likelihoods_per_class`` maps class ->
        {word id -> add-one-smoothed likelihood of the word in that class}.

    Raises:
        KeyError: If a training word is missing from ``word_2_num``.
    """
    prior_probabilities = {}
    word_likelihoods_per_class = {}

    # First pass: raw document counts per class and word counts per class.
    for _, real_class, words in read_samples(train_path):
        prior_probabilities[real_class] = prior_probabilities.get(real_class, 0.0) + 1.0
        word_counts = word_likelihoods_per_class.setdefault(real_class, {})
        for word in words:
            word_num = word_2_num[word]
            word_counts[word_num] = word_counts.get(word_num, 0.0) + 1.0

    # Turn document counts into class priors.
    number_of_documents = sum(prior_probabilities.values())
    for class_index in prior_probabilities:
        prior_probabilities[class_index] /= number_of_documents

    # Turn word counts into likelihoods with add-one (Laplace) smoothing over
    # the whole vocabulary, so unseen words get a non-zero probability.
    vocabulary_size = float(len(word_2_num))
    for word_likelihoods in word_likelihoods_per_class.values():
        # Total is taken before the counts are overwritten below.
        per_class_words = sum(word_likelihoods.values())
        denominator = per_class_words + vocabulary_size
        for word_num in word_2_num.values():
            word_likelihoods[word_num] = (
                1.0 + word_likelihoods.get(word_num, 0.0)) / denominator

    return prior_probabilities, word_likelihoods_per_class
def classify(sample_words, word2num, prior_probabilities, word_likelihoods_per_class):
    """Return the most probable class for a document.

    Scores are accumulated as sums of logarithms. Multiplying the raw
    probabilities underflows to 0.0 for long documents, which made every
    class score zero and the normalisation divide by zero.

    Args:
        sample_words: Words of the document to classify.
        word2num: Mapping from vocabulary word to its numeric id.
        prior_probabilities: Mapping class -> prior probability.
        word_likelihoods_per_class: Mapping class -> {word id -> likelihood}.

    Returns:
        The class with the highest posterior score. On a tie, the class that
        comes last in ``prior_probabilities`` wins.

    Raises:
        IndexError: If ``prior_probabilities`` is empty.
    """
    import math

    def _log(probability):
        # A zero probability rules the class out instead of raising.
        return math.log(probability) if probability > 0 else float('-inf')

    class_log_scores = {}
    for class_index, prior in prior_probabilities.items():
        word_likelihoods = word_likelihoods_per_class[class_index]
        score = _log(prior)
        for sample_word in sample_words:
            word_num = word2num.get(sample_word)
            if word_num is None:
                # Out-of-vocabulary words carry no evidence for any class.
                continue
            score += _log(word_likelihoods[word_num])
        class_log_scores[class_index] = score

    # Normalising by the total would not change the ordering, so the arg-max
    # is taken directly. A stable sort followed by [-1] picks the last class
    # among ties.
    sorted_classes = sorted(class_log_scores, key=class_log_scores.get)
    return sorted_classes[-1]
def evaluate(path, word_2_num, prior_probabilities, word_likelihoods_per_class):
    """Classify every document in a test file and print the accuracy.

    Args:
        path: Test file in the format read by ``read_samples``.
        word_2_num: Mapping from vocabulary word to its numeric id.
        prior_probabilities: Mapping class -> prior probability.
        word_likelihoods_per_class: Mapping class -> {word id -> likelihood}.
    """
    # Read the file the caller asked for; the path used to be ignored in
    # favour of a hard-coded 'Test.txt'.
    samples = read_samples(path)
    correct = 0.0
    print('Predictions on test data')
    for documents_id, real_class, words in samples:
        predicted_class = classify(words, word_2_num, prior_probabilities, word_likelihoods_per_class)
        print('{} = {}'.format(documents_id, predicted_class))
        if predicted_class == real_class:
            correct += 1
    # An empty test file reports 0% instead of dividing by zero.
    accuracy = correct * 100 / len(samples) if samples else 0.0
    print()
    print("Accuracy on test data = {}%".format(accuracy))
if __name__ == '__main__':
    # Train on the default vocabulary/training files, print the learned
    # parameters as a table, then score the test set.
    word_2_num = extract_vocab()
    prior_probabilities, word_likelihoods_per_class = train_model(word_2_num)

    print('Prior probabilities')
    for class_index in prior_probabilities:
        print('class {} = {}'.format(class_index, prior_probabilities[class_index]))
    print()

    print('Feature likelihoods')
    words = sorted(word_2_num.keys())
    col_width = max(map(len, words)) + 5

    # Header row: blank cell over the class labels, then right-aligned words.
    print(' ' * len('class 0 '), end='')
    for word in words:
        print(' ' * (col_width - len(word)), word, end=' ')
    print()

    # One row per class. Each value is padded so it lines up under its word
    # (a formatted likelihood is expected to be 8 characters, e.g. 0.123456).
    format_string = ' ' * (col_width - 7) + '{:.6f}'
    for class_index in sorted(word_likelihoods_per_class.keys()):
        word_likelihoods = word_likelihoods_per_class[class_index]
        print('class {}'.format(class_index), end=' ' * 3)
        for word in words:
            print(format_string.format(word_likelihoods.get(word_2_num[word], 0.0)), end=' ')
        print()
    print()

    evaluate('Test.txt', word_2_num, prior_probabilities, word_likelihoods_per_class)
Revise this Paste