Thanks for putting this together! I'm attempting to run this code, but can only get as far as "Scraping FB Posts from Business Times." When I run the section, it prints 11 weblinks, but the last weblink leads to a missing page on Business Times. This causes the script to give up with an HTTP error (output below). Any guidance is much appreciated!
HTTPError Traceback (most recent call last)
in
18 try:
---> 19 link_page = urlopen(url).read()
20 except:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
HTTPError: HTTP Error 403: Forbidden
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
in
20 except:
21 url = url[:-2]
---> 22 link_page = urlopen(url).read()
23 link_soup = BeautifulSoup(link_page)
24 sentences = link_soup.findAll("p")
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
`
Thanks for putting this together! I'm attempting to run this code, but can only get as far as "Scraping FB Posts from Business Times." When I run the section, it prints 11 weblinks, but the last weblink leads to a missing page on Business Times. This causes the script to give up with an HTTP error (output below). Any guidance is much appreciated!
`...
14 Mar 2019 https://www.businesstimes.com.sg/technology/massive-outage-adds-to-growing-facebook-woes
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-faces-criminal-probe-of-data-deals-report
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-instagram-outage-spreads-to-users-around-the-globe
HTTPError Traceback (most recent call last)
in
18 try:
---> 19 link_page = urlopen(url).read()
20 except:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
HTTPError: HTTP Error 403: Forbidden
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
in
20 except:
21 url = url[:-2]
---> 22 link_page = urlopen(url).read()
23 link_soup = BeautifulSoup(link_page)
24 sentences = link_soup.findAll("p")
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
`