-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler_based_on_event.py
More file actions
105 lines (85 loc) · 3 KB
/
crawler_based_on_event.py
File metadata and controls
105 lines (85 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import urllib.parse
import socket
import re
from selectors import *
# Shared state of the crawl; every Fetcher callback reads and mutates these.

# I/O multiplexer on which each Fetcher registers its socket and callback.
selector = DefaultSelector()

# Paths already discovered, so that no path is ever queued twice.
seen_urls = {'/'}

# Paths whose download has not finished yet; the crawl is over once it is empty.
urls_todo = {'/'}

# Set to True by the Fetcher that completes the last outstanding path.
stopped = False
class Fetcher:
    """Download one path from the local server with non-blocking, callback-driven I/O.

    A Fetcher registers its socket on the module-level ``selector``; the event
    loop then invokes ``connected`` and ``read_response`` as the socket becomes
    writable / readable.  When the response is complete, every new same-site
    link found in it is queued as a further Fetcher.
    """

    def __init__(self, url):
        """Remember the path to fetch (e.g. ``'/'``) and start with an empty response."""
        self.url = url
        self.response = b''
        self.sock = None

    def fetch(self):
        """Start a non-blocking connect to localhost:3000 and wait for writability."""
        self.sock = socket.socket()
        self.sock.setblocking(False)
        try:
            self.sock.connect(('localhost', 3000))
        except BlockingIOError:
            # Expected for a non-blocking connect: it is still in progress and
            # the selector reports the socket writable once it has finished.
            pass
        selector.register(self.sock.fileno(), EVENT_WRITE, self.connected)

    def connected(self, key, mask):
        """Selector callback: the socket is writable, so send the GET request."""
        selector.unregister(key.fd)
        # The end of the response is detected by EOF in read_response, so ask
        # the server to close the connection instead of keeping it alive
        # (keep-alive is the HTTP/1.1 default).
        get = ('GET {} HTTP/1.1\r\n'
               'Host:localhost\r\n'
               'Connection: close\r\n'
               '\r\n').format(self.url)
        self.sock.send(get.encode('ascii'))
        selector.register(key.fd, EVENT_READ, self.read_response)

    def read_response(self, key, mask):
        """Selector callback: accumulate response bytes; on EOF, process the page.

        On EOF the socket is released, links are extracted from the response,
        a new Fetcher is started for each link not seen before, and the global
        ``stopped`` flag is raised once no path is left outstanding.
        """
        global stopped
        chunk = self.sock.recv(4096)
        if chunk:
            self.response += chunk
            return

        # Empty read: the server closed the connection, the response is complete.
        selector.unregister(key.fd)
        self.sock.close()  # release the descriptor; one socket is opened per URL

        links = self.parseLink(self.url, self.response)
        for link in links.difference(seen_urls):
            urls_todo.add(link)
            Fetcher(link).fetch()
        seen_urls.update(links)
        urls_todo.remove(self.url)
        if not urls_todo:
            stopped = True
        print(self.url)

    def parseLink(self, current_url, response):
        """Return the set of same-site paths linked from an HTML response.

        ``response`` is the raw HTTP response (bytes) for ``current_url``.
        An empty response is reported on stdout; empty and non-HTML responses
        yield an empty set.  Fragment-only links and links that name a host
        are ignored; relative links are resolved against ``current_url``.
        """
        if not response:
            print("error: {}".format(current_url))
            return set()
        if not self.isHtml(response):
            return set()

        # Undecodable bytes must not abort the crawl; hrefs of interest are
        # plain ASCII and survive replacement of stray bytes elsewhere.
        body = self.extractBody(response.decode('utf-8', errors='replace'))

        links_set = set()
        # The capture group yields the link target directly (text after the
        # opening quote(s), up to whitespace, '#', a quote or an angle bracket).
        for part_url in re.findall(r'''(?i)href=['"]+([^\s#"'<>]+)''', body):
            try:
                whole_url = urllib.parse.urljoin(current_url, part_url)
                url_parse_result = urllib.parse.urlparse(whole_url)
            except ValueError:
                # Malformed link (e.g. unbalanced IPv6 bracket): skip it only.
                continue
            if url_parse_result.hostname:
                # Link names a host explicitly; only host-less paths are crawled.
                continue
            if url_parse_result.path:
                links_set.add(url_parse_result.path)
        return links_set

    def isHtml(self, text):
        """Return True if the raw response ``text`` (bytes) declares text/html.

        The Content-Type header is matched case-insensitively.  A response
        without a complete header section, or without that header, is treated
        as not HTML.
        """
        header, separator, _ = text.partition(b'\r\n\r\n')
        if not separator:
            return False
        # Skip the status line, then look for the Content-Type field.
        for line in header.split(b'\r\n')[1:]:
            name, colon, value = line.partition(b':')
            if colon and name.strip().lower() == b'content-type':
                return value.strip().lower().startswith(b'text/html')
        return False

    def extractBody(self, html):
        """Return the part of the decoded response after the first blank line.

        Returns an empty string when no header/body separator is present.
        """
        _, _, body = html.partition('\r\n\r\n')
        return body
if __name__ == "__main__":
    # Seed the crawl with the site root; its callbacks queue all further fetches.
    Fetcher('/').fetch()

    # Event loop: wait for ready sockets and hand each one to the callback that
    # was stored as the registration's data, until a Fetcher raises `stopped`.
    while not stopped:
        for ready_key, ready_mask in selector.select():
            ready_key.data(ready_key, ready_mask)

    print("Done")