functions

This commit is contained in:
Pavak Paul 2023-02-18 23:50:56 +05:30
parent fff21d97b8
commit 354f6ba752

68
app.py
View File

@ -0,0 +1,68 @@
import requests
from bs4 import BeautifulSoup
import os
# Interactive prompt for the start page, currently disabled in favour of the
# hard-coded mainURL below.
# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
# Site root; the scraping helpers in this file prepend it to the href values
# they read from the pages, and use it as the default for their base-URL
# parameter.
baseURL = 'https://egyankosh.ac.in'
# Hard-coded page URL used as the starting point while the prompt is disabled.
mainURL = 'https://egyankosh.ac.in/handle/123456789/404'
# Echo the start URL when the script runs.
print(mainURL)
def getDataFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server, passed straight to
            ``requests.get``. Without it a stalled connection would block
            the script indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        AssertionError: If the response status code is not 200.

    Exceptions raised by ``requests.get`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    # NOTE(security): verify=False disables TLS certificate verification, so
    # the response is not authenticated. Kept as-is because removing it may
    # break fetching from this host -- confirm before changing.
    r = requests.get(url, verify=False, timeout=timeout)
    if r.status_code != 200:
        # Raise explicitly: a bare `assert False` is removed when Python runs
        # with -O, which would let a failed response through silently.
        raise AssertionError(
            'GET {} returned HTTP {}'.format(url, r.status_code)
        )
    return r.text
def stripEmptySpace(html):
    """Collapse *html* onto one line and re-parse it.

    The argument is converted with ``str()``, split on newline characters,
    each piece is stripped of leading/trailing whitespace, and the pieces are
    glued back together with no separator. The result is parsed with the
    ``html.parser`` backend and returned as a ``BeautifulSoup`` object.
    """
    text = str(html)
    trimmed_pieces = [piece.strip() for piece in text.split("\n")]
    return BeautifulSoup("".join(trimmed_pieces), 'html.parser')
def getPageInfo(html):
    """Extract the title and type label from a page's HTML.

    Returns a dict with two keys:
      * ``'pageTitle'`` -- the text of the first ``<h2>``, cut at its first
        newline.
      * ``'pageType'`` -- the ``.string`` of the first element matching the
        CSS selector ``.col-md-8 small``.

    No missing-element handling is done: a page without an ``<h2>`` or
    without a selector match makes the lookups below fail.
    """
    document = BeautifulSoup(html, 'html.parser')
    heading_text = document.h2.text
    first_heading_line = heading_text.split('\n')[0]
    type_element = document.select(".col-md-8 small")[0]
    return {
        'pageTitle': first_heading_line,
        'pageType': type_element.string,
    }
def getSubPages(html, baseUrl=baseURL):
    """List the sub-pages linked from a listing page.

    Args:
        html: HTML source of the listing page.
        baseUrl: Prefix prepended to every row's ``href``. Defaults to the
            module-level ``baseURL``.

    Returns:
        A list of dicts, one per ``.list-group-item.row`` element inside the
        first ``.col-md-9 .list-group`` container, each with:
          * ``'name'`` -- the ``.string`` of the whitespace-collapsed row.
          * ``'url'``  -- ``baseUrl`` + the ``href`` of the row's first ``<a>``.

    Raises:
        IndexError: If the page has no ``.col-md-9 .list-group`` element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select('.col-md-9 .list-group')[0]
    subpages = []
    for row in table.select(".list-group-item.row"):
        # Re-parse the row without line breaks/indentation before reading
        # its string and anchor.
        row = stripEmptySpace(row)
        subpages.append({
            'name': row.string,
            # Use the caller-supplied prefix; the global was read here
            # before, which silently ignored the baseUrl argument.
            'url': baseUrl + row.a.get('href'),
        })
    return subpages
def getDownloadPageLinks(html, baseURL=baseURL):
    """Collect every link found in the first ``<table>`` of a page.

    Args:
        html: HTML source of the page.
        baseURL: Prefix prepended to each anchor's ``href``. Defaults to the
            module-level ``baseURL``.

    Returns:
        A list of dicts in document order, each with:
          * ``'name'`` -- the anchor's ``.string``.
          * ``'url'``  -- ``baseURL`` + the anchor's ``href``.

    No missing-element handling is done: a page without a ``<table>`` or an
    anchor without an ``href`` makes the lookups below fail.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current Beautiful Soup spelling; findAll is a
    # deprecated alias of the same method.
    return [
        {'name': anchor.string, 'url': baseURL + anchor.get('href')}
        for anchor in soup.table.find_all('a')
    ]
def downloadPdf(name, pageHtml, baseURL=baseURL):
    """Resolve the PDF link on a download page and print it.

    The target URL is ``baseURL`` plus the ``href`` of the first anchor
    matching the CSS selector ``.break-all a``; the file name is *name* with
    ``.pdf`` appended. Both are printed as a dict.

    NOTE: despite the name, nothing is fetched or written to disk here, and
    the function returns ``None``.
    """
    document = BeautifulSoup(pageHtml, 'html.parser')
    first_link = document.select(".break-all a")[0]
    target = {
        'url': baseURL + first_link.get('href'),
        'fileName': name + '.pdf',
    }
    print(target)
# print(getPageInfo(getDataFromURL(mainURL)))
# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
# print(getDownloadPageLinks(collectionPage))
# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
# downloadPdf('Unit-4 Mathematical Induction', downloadPage)