first commit
commit 7f3f99f24b
0 app/__init__.py Normal file

40 app/app.py Normal file
@@ -0,0 +1,40 @@
import asyncio
from spider.parse import ParseConnector, FrontItem
import spider.spider as spider


class Application:
    def __init__(
        self,
        tasks,
        parseConnector: ParseConnector = None,
    ):
        # Dedicated event loop that ParseConnector runs on its own thread.
        self.__parser_loop = asyncio.new_event_loop()

        if parseConnector is None:
            self.parseConnector = ParseConnector(loop=self.__parser_loop)
        else:
            self.parseConnector = parseConnector

        if isinstance(tasks, list):
            self.type = "multi_spider"
            self.tasks = {}

            for task in tasks:
                assert isinstance(task, spider.SpiderTask)
                self.tasks[task.tag] = task.get_tasks()

        elif isinstance(tasks, spider.SpiderTask):
            self.type = "single_spider"
            self.tasks = {tasks.tag: tasks.get_tasks()}
        else:
            raise TypeError

    async def start(self):
        self.parseConnector.start_parse()
        # self.tasks maps tag -> list of awaitables; flatten it before passing
        # it to as_completed (iterating the dict directly would yield the keys).
        pending = [t for group in self.tasks.values() for t in group]
        for r in asyncio.as_completed(pending):
            completed = await r
            # Each completed result is expected to expose .type, .content and .cb.
            await self.parseConnector.add(
                FrontItem(completed.type, completed.content, completed.cb)
            )
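
Application.start() drains a tag -> task-list mapping in completion order. A minimal, self-contained sketch of that flatten-then-as_completed pattern, with hypothetical fetch_a/fetch_b coroutines standing in for the spider's real request tasks:

import asyncio

async def fetch_a():
    await asyncio.sleep(0.2)
    return "a"

async def fetch_b():
    await asyncio.sleep(0.1)
    return "b"

async def drain(task_map):
    # Flatten the dict values into one list of awaitables, then consume the
    # results as they finish rather than in submission order.
    pending = [t for group in task_map.values() for t in group]
    for fut in asyncio.as_completed(pending):
        print(await fut)

async def demo():
    task_map = {
        "a": [asyncio.create_task(fetch_a())],
        "b": [asyncio.create_task(fetch_b())],
    }
    await drain(task_map)

asyncio.run(demo())  # prints "b" then "a"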

25 main.py Normal file
@@ -0,0 +1,25 @@
from app.app import Application
from spider.spider import SpiderTask
from spider.request import Requests
import asyncio


def main():
    tasks = [
        SpiderTask(
            "news",
            Requests(
                "http://www.baidu.com",
                [],
                "get",
            ),
            {},  # headers: SpiderTask's third positional argument is required
        ),
    ]

    app = Application(tasks=tasks)
    asyncio.run(app.start())


if __name__ == "__main__":
    main()

0 spider/__init__.py Normal file

55 spider/parse.py Normal file
@@ -0,0 +1,55 @@
import threading
import asyncio
from collections.abc import Callable


class OutPutItem:
    def __init__(self):
        pass


class FrontItem:
    def __init__(
        self,
        contentType: str,
        content: str,
        parserFunction: Callable[[str], OutPutItem],
    ):
        self.type = contentType
        self.content = content
        self.parserFunction = parserFunction


class ParseConnector:
    def __init__(self, loop, tr=None):
        self.__queue = asyncio.Queue()
        self.__loop = loop
        if tr is None:
            # Run the parser loop on its own daemon thread.
            self.__threading = threading.Thread(
                target=_new_loop_thread,
                args=(loop,),
            )
            self.__threading.daemon = True
        else:
            self.__threading = tr

        self.__threading.start()

    async def add(self, item: FrontItem):
        # The queue is consumed on the parser thread's loop, so schedule the
        # put() on that loop and await it through a wrapped future.
        fut = asyncio.run_coroutine_threadsafe(self.__queue.put(item), self.__loop)
        await asyncio.wrap_future(fut)

    async def parsing(self):
        while True:
            item: FrontItem = await self.__queue.get()
            item.parserFunction(item.content)
            self.__queue.task_done()

    def start_parse(self):
        asyncio.run_coroutine_threadsafe(self.parsing(), self.__loop)


def _new_loop_thread(loop: asyncio.AbstractEventLoop):
    # A double-underscore name would be mangled when referenced inside
    # ParseConnector, so the helper uses a single leading underscore.
    asyncio.set_event_loop(loop)
    loop.run_forever()
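
ParseConnector pairs a dedicated daemon thread with its own event loop and feeds it coroutines via run_coroutine_threadsafe. A minimal sketch of that background-loop pattern in isolation (run_loop and work are illustrative names, not part of this commit):

import asyncio
import threading

def run_loop(loop: asyncio.AbstractEventLoop):
    # The thread owns this loop until stop() is scheduled onto it.
    asyncio.set_event_loop(loop)
    loop.run_forever()

async def work(msg: str):
    await asyncio.sleep(0.1)
    print("parsed:", msg)

loop = asyncio.new_event_loop()
t = threading.Thread(target=run_loop, args=(loop,), daemon=True)
t.start()

# Submit a coroutine to the background loop from the main thread and block
# until it finishes; run_coroutine_threadsafe returns a concurrent Future.
asyncio.run_coroutine_threadsafe(work("hello"), loop).result()

loop.call_soon_threadsafe(loop.stop)
t.join()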

13 spider/request.py Normal file
@@ -0,0 +1,13 @@
from typing import List


class Requests:
    """One crawl target: a root URL, relative paths to join onto it, and an HTTP method."""

    def __init__(
        self,
        url: str,
        group: List[str],
        method: str,
    ):
        self.root = url
        self.method = method
        self.group = group
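
The group field holds relative paths that SpiderTask later joins onto root with urllib.parse.urljoin. A small example of the join behaviour (the query paths are made up for illustration):

from urllib.parse import urljoin

root = "http://www.baidu.com"
group = ["s?wd=python", "s?wd=asyncio"]
urls = [urljoin(root, path) for path in group]
# -> ['http://www.baidu.com/s?wd=python', 'http://www.baidu.com/s?wd=asyncio']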

60 spider/spider.py Normal file
@@ -0,0 +1,60 @@
import asyncio
import aiohttp
from spider.request import Requests
import urllib.parse
from typing import List


class SpiderTask:
    def __init__(
        self,
        tag: str,
        req: Requests,
        headers,
        cookies=None,
        proxy=None,
        parser=None,
    ):
        self.tag = tag
        self.req = req
        # asyncio.create_task below raises RuntimeError if no event loop is
        # running, and aiohttp expects ClientSession to be created inside a
        # coroutine, so SpiderTask should be constructed from async code.
        self.session = aiohttp.ClientSession(
            headers=headers,
            cookies=cookies,
        )

        self.proxy = proxy

        tmp = []
        if len(req.group) != 0:
            for r in req.group:
                tmp.append(
                    asyncio.create_task(
                        aiohttp.request(
                            req.method,
                            url=urllib.parse.urljoin(req.root, r),
                        )
                    )
                )

        self.__tasks: List[asyncio.Task] = tmp
        if isinstance(parser, list) and len(parser) != len(req.group):
            raise TypeError()
        self.parser = parser

    def delete_task(self, r: int):
        self.__tasks.pop(r)

    def add_task(self, req: Requests):
        if len(req.group) != 0:
            for r in req.group:
                self.__tasks.append(
                    asyncio.create_task(
                        aiohttp.request(
                            req.method,
                            url=urllib.parse.urljoin(req.root, r),
                        )
                    )
                )

    def get_tasks(self):
        return self.__tasks
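
Since asyncio.create_task needs a running event loop, the request tasks above can only be built from inside async code. A minimal sketch of the same fan-out fetch under that constraint (crawl and fetch are hypothetical helpers, not part of this commit):

import asyncio
import urllib.parse
import aiohttp

async def fetch(session: aiohttp.ClientSession, method: str, url: str) -> int:
    # Perform one request and return its HTTP status.
    async with session.request(method, url) as resp:
        return resp.status

async def crawl(root: str, paths: list, method: str = "get"):
    # Session and tasks are created while the loop is running, then awaited together.
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(fetch(session, method, urllib.parse.urljoin(root, p)))
            for p in paths
        ]
        return await asyncio.gather(*tasks)

# asyncio.run(crawl("http://www.baidu.com", ["s?wd=python"]))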