"""Helpers for scraping pages of the eGyanKosh repository (egyankosh.ac.in).

Each helper takes the raw HTML of an already-fetched page and extracts one
kind of information from it with BeautifulSoup.
"""

# Provenance: contents of app.py as added by commit 354f6ba752ea
# ("[PATCH] functions", Pavak Paul, Sat, 18 Feb 2023 23:50:56 +0530),
# reflowed from a patch whose lines had been collapsed together.

import os

import requests
from bs4 import BeautifulSoup

# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
baseURL = 'https://egyankosh.ac.in'
mainURL = 'https://egyankosh.ac.in/handle/123456789/404'
print(mainURL)


def getDataFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Pass ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The response body as a string (``Response.text``).

    Raises:
        AssertionError: If the response status code is anything but 200.
            Raised explicitly rather than via ``assert`` so the check is
            not stripped when Python runs with ``-O``.

    Exceptions raised by ``requests`` itself propagate unchanged.
    """
    # NOTE(review): verify=False disables TLS certificate verification, so
    # the server is not authenticated. Presumably a workaround for the
    # site's certificate -- confirm, and prefer a CA bundle if possible.
    r = requests.get(url, verify=False, timeout=timeout)
    if r.status_code != 200:
        raise AssertionError(
            'GET {} returned HTTP status {}'.format(url, r.status_code)
        )
    return r.text


def stripEmptySpace(html):
    """Re-parse *html* with per-line padding and line breaks removed.

    The input is converted with ``str()``, split on newlines, each line is
    stripped of leading/trailing whitespace, and the pieces are joined with
    no separator before being parsed again.

    Args:
        html: Markup as a string, or any object whose ``str()`` is markup
            (for example a BeautifulSoup tag).

    Returns:
        A new ``BeautifulSoup`` document built from the compacted markup.
    """
    html = str(html)
    html = "".join(line.strip() for line in html.split("\n"))
    return BeautifulSoup(html, 'html.parser')


def getPageInfo(html):
    """Extract the title and type label from a page.

    Args:
        html: Raw HTML of the page.

    Returns:
        A dict with two keys: ``'pageTitle'``, the first line of the text
        of the first ``<h2>``; and ``'pageType'``, the ``.string`` of the
        first element matching ``.col-md-8 small``.

    Raises:
        AttributeError: If the page has no ``<h2>``.
        IndexError: If nothing matches ``.col-md-8 small``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    pageTitle = soup.h2.text.split('\n')[0]
    pageType = soup.select(".col-md-8 small")[0].string
    return {'pageTitle': pageTitle, 'pageType': pageType}


def getSubPages(html, baseUrl=baseURL):
    """List the entries of the first ``.col-md-9 .list-group`` on a page.

    Args:
        html: Raw HTML of the page.
        baseUrl: Prefix prepended to each entry's ``href``.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one for each
        ``.list-group-item.row`` element, in document order.

    Raises:
        IndexError: If nothing matches ``.col-md-9 .list-group``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select('.col-md-9 .list-group')[0]
    subpages = []
    for row in table.select(".list-group-item.row"):
        # Compact the row first so that `.string` is not defeated by the
        # whitespace-only text nodes between tags.
        row = stripEmptySpace(row)
        subpages.append({
            'name': row.string,
            # Use the caller-supplied prefix; the module-level constant was
            # used here before, which silently ignored the argument.
            'url': baseUrl + row.a.get('href'),
        })
    return subpages


def getDownloadPageLinks(html, baseURL=baseURL):
    """Collect the links found in the first ``<table>`` of a page.

    Args:
        html: Raw HTML of the page.
        baseURL: Prefix prepended to each anchor's ``href``.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per ``<a>`` in
        the table that carries an ``href`` attribute. Anchors without an
        ``href`` are skipped, since they cannot yield a URL.

    Raises:
        AttributeError: If the page contains no ``<table>``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    return [
        {'name': anchor.string, 'url': baseURL + anchor.get('href')}
        for anchor in soup.table.find_all('a', href=True)
    ]


def downloadPdf(name, pageHtml, baseURL=baseURL):
    """Work out the PDF URL and target file name for a download page.

    Despite the name, nothing is downloaded yet: the function only prints a
    dict holding the resolved ``'url'`` and ``'fileName'``.

    Args:
        name: File name without extension; ``'.pdf'`` is appended.
        pageHtml: Raw HTML of the download page.
        baseURL: Prefix prepended to the ``href`` of the first
            ``.break-all a`` element.

    Raises:
        IndexError: If nothing matches ``.break-all a``.
    """
    soup = BeautifulSoup(pageHtml, 'html.parser')
    pdfUrl = baseURL + soup.select(".break-all a")[0].get('href')
    fileName = name + '.pdf'
    print({'url': pdfUrl, 'fileName': fileName})


# Manual checks, kept commented out:
# print(getPageInfo(getDataFromURL(mainURL)))

# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
# print(getDownloadPageLinks(collectionPage))

# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
# downloadPdf('Unit-4 Mathematical Induction', downloadPage)