Delete the Jupyter notebook and upload a .py file
This commit is contained in:
parent
9fb9e4d6c2
commit
aaab960e4f
@ -1,195 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Scraping for Binance Announcements\n",
    "Beta version. Modified on 07-09-2019"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import all the needed packages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bs4 as bs\n",
    "import urllib.request\n",
    "import tweepy, os, time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up the Twitter app credentials and authenticate with the API:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "consumer_key = os.environ.get('TW_CONSUMER_KEY')\n",
    "consumer_secret = os.environ.get('TW_CONSUMER_SECRET')\n",
    "access_token = os.environ.get('TW_ACCESS_TOKEN')\n",
    "access_token_secret = os.environ.get('TW_ACCESS_TOKEN_SECRET')\n",
    "# authentication of consumer key and secret\n",
    "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
    "# authentication of access token and secret\n",
    "auth.set_access_token(access_token, access_token_secret)\n",
    "api = tweepy.API(auth)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create two empty lists for storing news URLs. A tweet is sent whenever there is an item in new_urls that wasn't in old_urls:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "old_urls, new_urls = [], []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a bag of keywords for matching. Don't use plurals, otherwise you will get duplicates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "key_words = ['List', 'list', 'Token Sale', 'Open Trading', 'open trading']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the function that extracts the announcements from the webpage and returns the keyword matches:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_binance(main_webpage, key_words):\n",
    "    final_item, final_list = [], []\n",
    "    sauce = urllib.request.urlopen(main_webpage).read()\n",
    "    soup = bs.BeautifulSoup(sauce, 'lxml')\n",
    "    articles = soup.find_all('li', class_='article-list-item')\n",
    "    for article in articles:\n",
    "        article_text = article.get_text().replace('\\n', '')\n",
    "        for item in key_words:\n",
    "            if item in article_text:\n",
    "                final_item.append(article_text)\n",
    "                final_item.append('https://www.binance.com' + article.find('a').get('href'))\n",
    "                final_list.append(final_item)\n",
    "                final_item = []  # Reset once it is in final_list to avoid duplicates\n",
    "    return final_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run the first pass to fill old_urls:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 30.9 ms, sys: 349 µs, total: 31.2 ms\n",
      "Wall time: 77.1 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "main_webpage = 'https://www.binance.com/en/support/categories/115000056351'\n",
    "old_urls = extract_binance(main_webpage, key_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loop forever and tweet the new announcements:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done for now. Time to go to sleep mate!\n"
     ]
    }
   ],
   "source": [
    "# Loop pass - Watchdog mode\n",
    "while True:\n",
    "    new_urls = extract_binance(main_webpage, key_words)\n",
    "    for item in new_urls:\n",
    "        if item not in old_urls:\n",
    "            msg = item[0] + '\\n' + item[1]\n",
    "            api.update_status(msg)\n",
    "    old_urls = new_urls  # Remember what has been seen so the same announcement isn't tweeted again\n",
    "    print('Done for now. Time to go to sleep mate!')\n",
    "    time.sleep(900)  # Check every 15 min"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
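The commit message says the notebook was replaced by a .py file, but that file is not part of this diff. As a rough sketch only (an assumption about what such a standalone script could look like, not the actual uploaded file), the cells above condense into a single watchdog script:

# Sketch only: hypothetical standalone version of the deleted notebook,
# not the .py file actually added in this commit.
import os
import time
import urllib.request

import bs4 as bs
import tweepy

KEY_WORDS = ['List', 'list', 'Token Sale', 'Open Trading', 'open trading']
MAIN_WEBPAGE = 'https://www.binance.com/en/support/categories/115000056351'


def extract_binance(main_webpage, key_words):
    """Return [title, url] pairs for announcements matching any keyword."""
    final_item, final_list = [], []
    sauce = urllib.request.urlopen(main_webpage).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    articles = soup.find_all('li', class_='article-list-item')
    for article in articles:
        article_text = article.get_text().replace('\n', '')
        for item in key_words:
            if item in article_text:
                final_item.append(article_text)
                final_item.append('https://www.binance.com' + article.find('a').get('href'))
                final_list.append(final_item)
                final_item = []  # Reset so the next match starts a new pair
    return final_list


def main():
    # Credentials come from environment variables, as in the notebook
    auth = tweepy.OAuthHandler(os.environ.get('TW_CONSUMER_KEY'),
                               os.environ.get('TW_CONSUMER_SECRET'))
    auth.set_access_token(os.environ.get('TW_ACCESS_TOKEN'),
                          os.environ.get('TW_ACCESS_TOKEN_SECRET'))
    api = tweepy.API(auth)

    old_urls = extract_binance(MAIN_WEBPAGE, KEY_WORDS)  # First pass
    while True:  # Watchdog mode
        new_urls = extract_binance(MAIN_WEBPAGE, KEY_WORDS)
        for item in new_urls:
            if item not in old_urls:
                api.update_status(item[0] + '\n' + item[1])
        old_urls = new_urls  # Don't tweet the same announcement twice
        time.sleep(900)  # Check every 15 min


if __name__ == '__main__':
    main()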