Script stops at "Scraping FB Posts..." with HTTPError

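Thanks for putting this together! I'm trying to run this code, but I can only get as far as "Scraping FB Posts from Business Times." When I run that section, it prints 11 links, but the last link leads to a missing page on Business Times. The first request for that link fails with HTTP 403 (Forbidden); the `except` branch then retries with the last two characters of the URL removed, and that second request fails with HTTP 404 (Not Found), which stops the script (full traceback below). Any guidance is much appreciated!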

`...
14 Mar 2019 https://www.businesstimes.com.sg/technology/massive-outage-adds-to-growing-facebook-woes
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-faces-criminal-probe-of-data-deals-report
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-instagram-outage-spreads-to-users-around-the-globe
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-3-d57752a5a034> in <module>
     18         try:
---> 19             link_page = urlopen(url).read()
     20         except:

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 

HTTPError: HTTP Error 403: Forbidden

During handling of the above exception, another exception occurred:

HTTPError                                 Traceback (most recent call last)
<ipython-input-3-d57752a5a034> in <module>
     20         except:
     21             url = url[:-2]
---> 22             link_page = urlopen(url).read()
     23         link_soup = BeautifulSoup(link_page)
     24         sentences = link_soup.findAll("p")

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    530         for processor in self.process_response.get(protocol, []):
    531             meth = getattr(processor, meth_name)
--> 532             response = meth(req, response)
    533 
    534         return response

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
    640         if not (200 <= code < 300):
    641             response = self.parent.error(
--> 642                 'http', request, response, code, msg, hdrs)
    643 
    644         return response

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
    568         if http_err:
    569             args = (dict, 'default', 'http_error_default') + orig_args
--> 570             return self._call_chain(*args)
    571 
    572 # XXX probably also want an abstract factory that knows when it makes

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    648 class HTTPDefaultErrorHandler(BaseHandler):
    649     def http_error_default(self, req, fp, code, msg, hdrs):
--> 650         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    651 
    652 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found
`

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Script stops at "Scraping FB Posts..." with HTTPError #1

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Script stops at "Scraping FB Posts..." with HTTPError #1

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions