first commit

sleptworld 2022-03-19 22:04:16 +08:00
commit 7f3f99f24b
7 changed files with 193 additions and 0 deletions

0
app/__init__.py Normal file

40
app/app.py Normal file

@@ -0,0 +1,40 @@
import asyncio

from spider.parse import ParseConnector, FrontItem
import spider.spider as spider


class Application:
    def __init__(
        self,
        tasks,
        parseConnector: ParseConnector = None,
    ):
        # dedicated event loop for the parser thread owned by ParseConnector
        self.__parser_loop = asyncio.new_event_loop()
        if parseConnector is None:
            self.parseConnector = ParseConnector(loop=self.__parser_loop)
        else:
            self.parseConnector = parseConnector

        # accept either a list of SpiderTask or a single SpiderTask
        if isinstance(tasks, list):
            self.type = "multi_spider"
            self.tasks = {}
            for task in tasks:
                assert isinstance(task, spider.SpiderTask)
                self.tasks[task.tag] = task.get_tasks()
        elif isinstance(tasks, spider.SpiderTask):
            self.type = "single_spider"
            self.tasks = {tasks.tag: tasks.get_tasks()}
        else:
            raise TypeError("tasks must be a SpiderTask or a list of SpiderTask")

    async def start(self):
        self.parseConnector.start_parse()
        # self.tasks maps tag -> list of scheduled request tasks; flatten it
        # before handing everything to as_completed
        pending = [t for group in self.tasks.values() for t in group]
        for r in asyncio.as_completed(pending):
            completed = await r
            # each finished task is expected to expose type, content and a
            # parser callback (cb) for the FrontItem handed to the parser
            await self.parseConnector.add(
                FrontItem(completed.type, completed.content, completed.cb)
            )
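
A construction-only sketch, not part of this commit and assuming the modules above: Application accepts either a single SpiderTask or a list of them, and keys each task group by its tag.

import asyncio
from app.app import Application
from spider.request import Requests
from spider.spider import SpiderTask

async def demo():
    # empty group: nothing is fetched, this only shows the accepted shapes
    news = SpiderTask("news", Requests("http://www.baidu.com", [], "get"), {})
    single = Application(tasks=news)        # a list of SpiderTask would give "multi_spider"
    print(single.type, list(single.tasks))  # single_spider ['news']

asyncio.run(demo())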

25
main.py Normal file

@@ -0,0 +1,25 @@
import asyncio

from app.app import Application
from spider.request import Requests
from spider.spider import SpiderTask


async def run():
    # SpiderTask schedules its requests with asyncio.create_task, so it has to
    # be constructed while an event loop is running
    tasks = [
        SpiderTask(
            "news",
            Requests(
                "http://www.baidu.com",
                [],
                "get",
            ),
            {},  # headers for the SpiderTask session; Requests only takes url, group, method
        ),
    ]
    app = Application(tasks=tasks)
    await app.start()


def main():
    asyncio.run(run())


if __name__ == "__main__":
    main()

0
spider/__init__.py Normal file

55
spider/parse.py Normal file

@@ -0,0 +1,55 @@
import asyncio
import threading
from collections.abc import Callable


# placeholder for the structured result produced by a parser callback
class OutPutItem:
    def __init__(self):
        pass


# a unit of work for the parser: raw content plus the callback that parses it
class FrontItem:
    def __init__(
        self,
        contentType: str,
        content: str,
        parserFunction: Callable[[str], OutPutItem],
    ):
        self.type = contentType
        self.content = content
        self.parserFunction = parserFunction


# bridges the crawl side and a parser thread running its own event loop
class ParseConnector:
    def __init__(self, loop, tr=None):
        self.__queue = asyncio.Queue()
        self.__loop = loop
        if tr is None:
            # run the parser loop in a daemon thread unless a thread is supplied
            self.__threading = threading.Thread(
                target=_run_loop,
                args=(loop,),
            )
            self.__threading.daemon = True
        else:
            self.__threading = tr
        self.__threading.start()

    async def add(self, item: FrontItem):
        # the queue is consumed on the parser thread's loop, so run put()
        # over there instead of awaiting it on the caller's loop
        await asyncio.wrap_future(
            asyncio.run_coroutine_threadsafe(self.__queue.put(item), self.__loop)
        )

    async def parsing(self):
        while True:
            item: FrontItem = await self.__queue.get()
            item.parserFunction(item.content)
            self.__queue.task_done()

    def start_parse(self):
        asyncio.run_coroutine_threadsafe(self.parsing(), self.__loop)


# module-level helper; a double-underscore name here would be name-mangled
# when referenced from inside ParseConnector
def _run_loop(loop: asyncio.AbstractEventLoop):
    asyncio.set_event_loop(loop)
    loop.run_forever()
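
A short sketch, not from the commit, of the parser-callback contract: parserFunction takes the raw content string and returns an OutPutItem, which is exactly what ParseConnector.parsing() calls for each queued FrontItem.

from spider.parse import FrontItem, OutPutItem

def parse_news(content: str) -> OutPutItem:
    # hypothetical parser; a real one would extract fields from `content`
    print("parsed", len(content), "characters")
    return OutPutItem()

front = FrontItem("html", "<html><body>hi</body></html>", parse_news)
front.parserFunction(front.content)  # what parsing() does for each queued item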

13
spider/request.py Normal file

@@ -0,0 +1,13 @@
from typing import List


# describes a crawl target: a base URL, relative paths, and an HTTP method
class Requests:
    def __init__(
        self,
        url: str,
        group: List[str],
        method: str,
    ):
        self.root = url        # base URL that group entries are joined onto
        self.method = method   # HTTP method name, e.g. "get"
        self.group = group     # relative paths to fetch under the base URL
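
For illustration (the query paths are made up): a Requests with a non-empty group is what SpiderTask fans out into one HTTP call per entry, joining each path onto the base URL with urllib.parse.urljoin.

from urllib.parse import urljoin
from spider.request import Requests

req = Requests("http://www.baidu.com", ["s?wd=python", "s?wd=asyncio"], "get")
for path in req.group:
    print(req.method.upper(), urljoin(req.root, path))
# GET http://www.baidu.com/s?wd=python
# GET http://www.baidu.com/s?wd=asyncio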

60
spider/spider.py Normal file

@@ -0,0 +1,60 @@
import asyncio
import urllib.parse
from typing import List

import aiohttp

from spider.request import Requests


class SpiderTask:
    def __init__(
        self,
        tag: str,
        req: Requests,
        headers,
        cookies=None,
        proxy=None,
        parser=None,
    ):
        self.tag = tag
        self.req = req
        # session holding the shared headers/cookies; SpiderTask is meant to be
        # constructed inside a running event loop (create_task below needs one)
        self.session = aiohttp.ClientSession(
            headers=headers,
            cookies=cookies,
        )
        self.proxy = proxy
        # schedule one request per relative path in the group
        tmp = []
        if len(req.group) != 0:
            for r in req.group:
                tmp.append(
                    asyncio.create_task(
                        aiohttp.request(
                            req.method,
                            url=urllib.parse.urljoin(req.root, r),
                        )
                    )
                )
        self.__tasks: List[asyncio.Task] = tmp
        if isinstance(parser, list) and len(parser) != len(req.group):
            raise TypeError()
        self.parser = parser

    def delete_task(self, r: int):
        self.__tasks.pop(r)

    def add_task(self, req: Requests):
        if len(req.group) != 0:
            for r in req.group:
                self.__tasks.append(
                    asyncio.create_task(
                        aiohttp.request(
                            req.method,
                            url=urllib.parse.urljoin(req.root, r),
                        )
                    )
                )

    def get_tasks(self):
        return self.__tasks
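
A usage sketch, not from the commit and with a hypothetical query path: because the constructor calls asyncio.create_task, a SpiderTask has to be built inside a running event loop, after which get_tasks() returns awaitable request tasks.

import asyncio
from spider.request import Requests
from spider.spider import SpiderTask

async def demo():
    task = SpiderTask(
        "news",
        Requests("http://www.baidu.com", ["s?wd=python"], "get"),
        {"User-Agent": "spider-demo"},
    )
    for fut in asyncio.as_completed(task.get_tasks()):
        resp = await fut
        print(resp.status, resp.url)
    # note: like the commit itself, this sketch never closes the aiohttp sessions

asyncio.run(demo())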