commit 7f3f99f24bb6466eb54fa66762c00dd9a8df7808 Author: sleptworld Date: Sat Mar 19 22:04:16 2022 +0800 first commit diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/app.py b/app/app.py new file mode 100644 index 0000000..8eb50f8 --- /dev/null +++ b/app/app.py @@ -0,0 +1,40 @@ +import asyncio +from spider.parse import ParseConnector, FrontItem +import spider.spider as spider + + +class Application: + def __init__( + self, + tasks, + parseConnector: ParseConnector = None, + ): + + self.__parser_loop = asyncio.new_event_loop() + + if parseConnector is None: + self.parseConnector = ParseConnector(loop=self.__parser_loop) + else: + self.parseConnector = parseConnector + + if isinstance(tasks, list): + self.type = "multi_spider" + self.tasks = {} + + for task in tasks: + assert isinstance(task, spider.SpiderTas) + self.tasks[task.tag] = task.get_tasks() + + elif isinstance(tasks, spider.SpiderTask): + self.type = "single_spider" + self.tasks = {tasks.tag: task.get_tasks()} + else: + raise TypeError + + async def start(self): + self.parseConnector.start_parse() + for r in asyncio.as_completed(self.tasks): + completed = await r + await self.parseConnector.add( + FrontItem(completed.type, completed.content, completed.cb) + ) diff --git a/main.py b/main.py new file mode 100644 index 0000000..82ac057 --- /dev/null +++ b/main.py @@ -0,0 +1,25 @@ +from app import Application +from spider.spider import SpiderTask +from spider.request import Requests +import asyncio + + +def main(): + tasks = [ + SpiderTask( + "news", + Requests( + "http://www.baidu.com", + [], + "get", + {}, + ), + ), + ] + + app = Application(tasks=tasks) + asyncio.run(app.start()) + + +if __name__ == "__main__": + main() diff --git a/spider/__init__.py b/spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider/parse.py b/spider/parse.py new file mode 100644 index 0000000..df3b515 --- /dev/null +++ b/spider/parse.py @@ -0,0 +1,55 @@ +import threading +import asyncio +from collections.abc import Callable + + +class OutPutItem: + def __init__(self): + pass + + +class FrontItem: + def __init__( + self, + contentType: str, + content: str, + parserFunction: Callable[[str], OutPutItem], + ): + self.type = contentType + self.content = content + self.parserFunction = parserFunction + + +class ParseConnector: + def __init__(self, loop, tr=None): + self.__queue = asyncio.Queue() + self.__loop = loop + if tr is None: + self.__threading = threading.Thread( + target=__NewTr, + args=(loop,), + ) + self.__threading.daemon = True + + else: + self.__threading = tr + + self.__threading.start() + + async def add(self, item: FrontItem): + await self.__queue.put(item) + + async def parsing(self): + while True: + item: FrontItem = await self.__queue.get() + item.parserFunction(item.content) + self.__queue.task_done() + await self.__queue.join() + + def start_parse(self): + asyncio.run_coroutine_threadsafe(self.parsing(), self.__loop) + + +def __NewTr(loop: asyncio.AbstractEventLoop): + asyncio.set_event_loop(loop) + loop.run_forever() diff --git a/spider/request.py b/spider/request.py new file mode 100644 index 0000000..5ba53a3 --- /dev/null +++ b/spider/request.py @@ -0,0 +1,13 @@ +from typing import List, Dict + + +class Requests: + def __init__( + self, + url: str, + group: List[str], + method: str, + ): + self.root = url + self.method = method + self.group = group diff --git a/spider/spider.py b/spider/spider.py new file mode 100644 index 0000000..6330f19 --- /dev/null +++ b/spider/spider.py @@ -0,0 +1,60 @@ +import asyncio +import aiohttp +from request import Requests +import urllib +from typing import List + + +class SpiderTask: + def __init__( + self, + tag: str, + req: Requests, + headers, + cookies=None, + proxy=None, + parser=None, + ): + self.tag = tag + self.req = req + self.session = aiohttp.ClientSession( + headers=headers, + cookies=cookies, + ) + + self.proxy = proxy + + tmp = [] + if len(req.group) != 0: + for r in req.group: + tmp.append( + asyncio.create_task( + aiohttp.request( + req.method, + url=urllib.parse.urljoin(req.root, r), + ) + ) + ) + + self.__tasks: List[asyncio.Task] = tmp + if isinstance(parser, list) and len(parser) != len(req.group): + raise TypeError() + self.parser = parser + + def delete_task(self, r: int): + self.__tasks.pop(r) + + def add_task(self, req: Requests): + if len(req.group) != 0: + for r in req.group: + self.__tasks.append( + asyncio.create_task( + aiohttp.request( + req.method, + url=urllib.parse.urljoin(req.root, r), + ) + ) + ) + + def get_tasks(self): + return self.tasks