functions

2023-02-18 23:50:56 +05:30 · 2023-02-18 23:50:56 +05:30 · 354f6ba752
commit 354f6ba752
parent fff21d97b8
1 changed files with 68 additions and 0 deletions
--- a/app.py
+++ b/app.py
@ -0,0 +1,68 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+
+# Interactive prompt for the start page; disabled in favour of the hard-coded
+# mainURL assigned below.
+# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
+# Site root. Used as the default prefix for the hrefs that the scraping
+# functions in this file turn into full URLs.
+baseURL = 'https://egyankosh.ac.in'
+# Hard-coded start page.
+mainURL = 'https://egyankosh.ac.in/handle/123456789/404'
+# Executed at import time: echoes the start page URL to stdout.
+print(mainURL)
+
+def getDataFromURL(url, timeout=30, verify=False):
+    """Fetch `url` with an HTTP GET and return the response body as text.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server, so that a stalled
+            connection cannot hang the script indefinitely.
+        verify: Forwarded to `requests.get`. The default of False keeps the
+            original behaviour of skipping TLS certificate verification.
+
+    Returns:
+        The response body as decoded by `Response.text`.
+
+    Raises:
+        AssertionError: If the response status code is not 200.
+        requests.exceptions.RequestException: Propagated from
+            `requests.get`, e.g. when the timeout expires.
+    """
+    # NOTE(review): verify=False turns off certificate checks, which leaves
+    # the download open to tampering in transit. Pass verify=True once the
+    # site's certificate chain is confirmed to validate.
+    r = requests.get(url, timeout=timeout, verify=verify)
+    if r.status_code != 200:
+        # Raised explicitly: a bare `assert False` is removed under
+        # `python -O` (the error page would then be returned as data) and
+        # gives no hint about what went wrong.
+        raise AssertionError(
+            'GET {} returned HTTP status {}'.format(url, r.status_code))
+    return r.text
+
+def stripEmptySpace(html):
+    """Re-parse `html` with line breaks and per-line padding removed.
+
+    The input is converted with `str()`, cut at every newline character,
+    each piece is stripped of leading and trailing whitespace, and the
+    pieces are glued back together with no separator.
+
+    Returns:
+        A new BeautifulSoup object parsed from the compacted markup.
+    """
+    markup = str(html)
+    pieces = [piece.strip() for piece in markup.split("\n")]
+    return BeautifulSoup("".join(pieces), 'html.parser')
+
+def getPageInfo(html):
+    """Extract the title and the type label from a page's markup.
+
+    Returns:
+        A dict with two keys: 'pageTitle' is the first line of the text of
+        the first <h2>, and 'pageType' is the `.string` of the first element
+        matching the selector ".col-md-8 small".
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    headingText = soup.h2.text
+    typeTag = soup.select(".col-md-8 small")[0]
+    return {
+        # Only the part before the first line break is kept.
+        'pageTitle': headingText.partition('\n')[0],
+        'pageType': typeTag.string,
+    }
+
+def getSubPages(html, baseUrl = baseURL):
+    """List the sub-pages linked from a listing page.
+
+    Args:
+        html: Markup of the listing page.
+        baseUrl: Prefix prepended to each row's href to form the full URL.
+
+    Returns:
+        A list of {'name': ..., 'url': ...} dicts, one for every
+        ".list-group-item.row" element inside the first
+        ".col-md-9 .list-group" container.
+
+    Raises:
+        IndexError: If no ".col-md-9 .list-group" element is present.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    table = soup.select('.col-md-9 .list-group')[0]
+    subpages = []
+    for row in table.select(".list-group-item.row"):
+        # Compact the row's markup before reading `.string`; presumably the
+        # whitespace between tags would otherwise get in the way of
+        # resolving a single string for the row -- TODO confirm.
+        compactRow = stripEmptySpace(row)
+        subpages.append({
+            'name': compactRow.string,
+            # Use the caller-supplied prefix; the module-level baseURL was
+            # previously used here, which made the parameter a no-op.
+            'url': baseUrl + compactRow.a.get('href'),
+        })
+    return subpages
+
+def getDownloadPageLinks(html, baseURL = baseURL):
+    """Collect the links found in the first <table> of a page.
+
+    Args:
+        html: Markup of the page.
+        baseURL: Prefix prepended to each anchor's href to form the full URL.
+
+    Returns:
+        A list of {'name': ..., 'url': ...} dicts, one per <a> element in
+        the table that carries an href; 'name' is the anchor's `.string`.
+
+    Raises:
+        AttributeError: If the page contains no <table>.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    # find_all replaces the deprecated findAll alias. href=True skips
+    # anchors that have no href, which previously crashed the concatenation
+    # below with a TypeError (baseURL + None).
+    anchors = soup.table.find_all('a', href=True)
+    return [
+        {'name': anchor.string, 'url': baseURL + anchor.get('href')}
+        for anchor in anchors
+    ]
+
+def downloadPdf(name, pageHtml, baseURL = baseURL):
+    """Work out the PDF address and file name for a download page.
+
+    The href of the first ".break-all a" match is appended to `baseURL`, and
+    ".pdf" is appended to `name`. Nothing is fetched or written here: the
+    two values are only printed as a dict, and the function returns None.
+
+    Args:
+        name: Base name for the file, without extension.
+        pageHtml: Markup of the download page.
+        baseURL: Prefix prepended to the scraped href.
+    """
+    soup = BeautifulSoup(pageHtml, 'html.parser')
+    firstLink = soup.select(".break-all a")[0]
+    details = {
+        'url': baseURL + firstLink.get('href'),
+        'fileName': name + '.pdf',
+    }
+    print(details)
+
+
+
+
+# print(getPageInfo(getDataFromURL(mainURL)))
+
+# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
+# print(getDownloadPageLinks(collectionPage))
+
+# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
+# downloadPdf('Unit-4 Mathematical Induction', downloadPage)