functions
This commit is contained in:
parent
fff21d97b8
commit
354f6ba752
68
app.py
68
app.py
@ -0,0 +1,68 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
|
||||
# mainURL = input('Please enter the URL of eGyanKosh page\n: ')
|
||||
# Scheme + host that the scraping helpers below prepend to scraped hrefs.
baseURL = 'https://egyankosh.ac.in'

# Page the script currently starts from (hard-coded; the interactive
# input() variant is commented out above).
mainURL = 'https://egyankosh.ac.in/handle/123456789/404'

print(mainURL)
|
||||
|
||||
def getDataFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Value forwarded to ``requests.get(timeout=...)``; pass
            ``None`` to wait without a limit.

    Returns:
        The response body as exposed by ``Response.text``.

    Raises:
        AssertionError: If the response status code is anything other
            than 200.

    Any exception raised by ``requests.get`` itself propagates unchanged.
    """
    # NOTE(review): verify=False disables TLS certificate validation, so the
    # server's identity is not checked. Kept as in the original -- confirm
    # whether the target site really requires it.
    r = requests.get(url, verify=False, timeout=timeout)
    if r.status_code != 200:
        # Raised explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let error pages through.
        raise AssertionError(
            'GET {} returned HTTP status {}'.format(url, r.status_code)
        )
    return r.text
|
||||
|
||||
def stripEmptySpace(html):
    """Return *html* re-parsed with line breaks and per-line padding removed.

    The argument is converted with ``str()``, split on ``"\\n"``, each piece
    is stripped of leading/trailing whitespace, and the pieces are glued back
    together with no separator before being parsed by ``html.parser``.

    Args:
        html: Markup to clean; anything ``str()`` accepts.

    Returns:
        A ``BeautifulSoup`` object built from the compacted markup.
    """
    stripped_lines = map(str.strip, str(html).split("\n"))
    return BeautifulSoup("".join(stripped_lines), 'html.parser')
|
||||
|
||||
def getPageInfo(html):
    """Extract the title and type label from a page's HTML.

    Args:
        html: Markup of the page to inspect.

    Returns:
        A dict with two keys:
        ``'pageTitle'`` -- the text of the first ``<h2>`` up to (not
        including) its first newline;
        ``'pageType'`` -- the ``.string`` of the first element matching
        the CSS selector ``.col-md-8 small``.

    Raises:
        AttributeError: If the document has no ``<h2>``.
        IndexError: If nothing matches ``.col-md-8 small``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    heading_text = soup.h2.text
    type_tags = soup.select(".col-md-8 small")
    return {
        # Only the first line of the heading is the title.
        'pageTitle': heading_text.partition('\n')[0],
        'pageType': type_tags[0].string,
    }
|
||||
|
||||
def getSubPages(html, baseUrl = baseURL):
    """List the sub-pages linked from a listing page.

    Looks inside the first element matching ``.col-md-9 .list-group`` and
    produces one entry per ``.list-group-item.row`` element found there.

    Args:
        html: Markup of the listing page.
        baseUrl: Prefix concatenated in front of each row's ``href``.
            Defaults to the module-level ``baseURL``.

    Returns:
        A list of dicts, each with ``'name'`` (the row's ``.string`` after
        whitespace compaction by ``stripEmptySpace``) and ``'url'``
        (``baseUrl`` + the ``href`` of the row's first ``<a>``).

    Raises:
        IndexError: If nothing matches ``.col-md-9 .list-group``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select('.col-md-9 .list-group')[0]
    subpages = []
    for row in table.select(".list-group-item.row"):
        # Compact the row's markup first so `.string` is not confused by
        # the whitespace-only text between tags.
        row = stripEmptySpace(row)
        subpages.append({
            'name': row.string,
            # Use the caller-supplied prefix; the original read the global
            # `baseURL` here, silently ignoring the `baseUrl` argument.
            'url': baseUrl + row.a.get('href'),
        })
    return subpages
|
||||
|
||||
def getDownloadPageLinks(html, baseURL = baseURL):
    """Collect the links found in the first ``<table>`` of a page.

    Args:
        html: Markup of the page to scan.
        baseURL: Prefix concatenated in front of every ``href``.
            Defaults to the module-level ``baseURL``.

    Returns:
        A list of dicts, one per ``<a>`` that carries an ``href``
        attribute, each with ``'name'`` (the anchor's ``.string``) and
        ``'url'`` (``baseURL`` + the ``href`` value).

    Raises:
        AttributeError: If the document contains no ``<table>``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all replaces the deprecated findAll alias. href=True skips anchors
    # without an href, which would otherwise make the string concatenation
    # below fail with TypeError.
    anchors = soup.table.find_all('a', href=True)
    return [
        {
            'name': anchor.string,
            'url': baseURL + anchor.get('href'),
        }
        for anchor in anchors
    ]
|
||||
|
||||
def downloadPdf(name, pageHtml, baseURL = baseURL):
    """Work out the PDF URL and target file name for a download page.

    As written, this only prints the result; nothing is fetched or saved.

    Args:
        name: Base name for the file; ``'.pdf'`` is appended to it.
        pageHtml: Markup of the download page.
        baseURL: Prefix concatenated in front of the scraped ``href``.
            Defaults to the module-level ``baseURL``.

    Returns:
        None. Prints a dict with the keys ``'url'`` and ``'fileName'``.

    Raises:
        IndexError: If nothing matches the selector ``.break-all a``.
    """
    soup = BeautifulSoup(pageHtml, 'html.parser')
    # The first link matching `.break-all a` is taken as the PDF link.
    pdf_anchor = soup.select(".break-all a")[0]
    target = {
        'url': baseURL + pdf_anchor.get('href'),
        'fileName': name + '.pdf',
    }
    print(target)
|
||||
|
||||
|
||||
|
||||
|
||||
# print(getPageInfo(getDataFromURL(mainURL)))
|
||||
|
||||
# collectionPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/1576')
|
||||
# print(getDownloadPageLinks(collectionPage))
|
||||
|
||||
# downloadPage = getDataFromURL('https://egyankosh.ac.in/handle/123456789/10976')
|
||||
# downloadPdf('Unit-4 Mathematical Induction', downloadPage)
|
||||
Loading…
Reference in New Issue
Block a user