binance-announcements-scrap.../Data Scraping for Binance Announcements.ipynb
2019-09-07 17:59:00 +02:00

215 lines
5.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Scraping for Binance Announcements\n",
"Beta version. Modified on 07-09-2019"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import all the needed packages:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import bs4 as bs\n",
"import urllib.request\n",
"import tweepy, os"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set up the Twitter app: read the API credentials from environment variables and authenticate with tweepy."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Twitter app credentials come from the environment; an unset variable yields None\n",
"consumer_key = os.getenv('TW_CONSUMER_KEY')\n",
"consumer_secret = os.getenv('TW_CONSUMER_SECRET')\n",
"access_token = os.getenv('TW_ACCESS_TOKEN')\n",
"access_token_secret = os.getenv('TW_ACCESS_TOKEN_SECRET')\n",
"\n",
"# Build the OAuth handler from the consumer pair, then attach the access-token pair\n",
"auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
"auth.set_access_token(access_token, access_token_secret)\n",
"\n",
"# tweepy API client built on the authenticated handler\n",
"api = tweepy.API(auth)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create two empty lists for storing the news URLs. A message should be sent whenever the newly scraped list contains an item that was not in `old_urls`."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# old_urls: baseline filled by the first scrape\n",
"# new_urls: latest scrape, compared against the baseline further down\n",
"# (the second list is named `new_urls` because that is the name the later cells read)\n",
"old_urls, new_urls = [], []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a bag of key words for getting matches. Don't add plurals: matching is by substring, so the singular form already covers them and a plural would only produce duplicate matches."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Substrings looked for in each article's text. The `in` test used for matching is\n",
"# case-sensitive, hence the capitalised and lower-case variants.\n",
"key_words = [\n",
"    'List', 'list',\n",
"    'Token Sale',\n",
"    'Open Trading', 'open trading',\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the function that extracts the information from the webpage and returns the matches"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def extract_binance(main_webpage, key_words):\n",
"    \"\"\"Download a page and return the listed articles that match any key word.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    main_webpage : str\n",
"        URL of the page to download.\n",
"    key_words : iterable of str\n",
"        Substrings searched for (case-sensitively) in each article's text.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of [str, str]\n",
"        One ``[article_text, article_url]`` pair per matching article, in page\n",
"        order. An article appears at most once, even when several key words\n",
"        match it.\n",
"    \"\"\"\n",
"    # Close the HTTP response as soon as the body has been read\n",
"    with urllib.request.urlopen(main_webpage) as response:\n",
"        sauce = response.read()\n",
"    soup = bs.BeautifulSoup(sauce, 'lxml')\n",
"    # Named `articles` so the built-in `list` is not shadowed\n",
"    articles = soup.find_all('li', class_='article-list-item')\n",
"\n",
"    final_list = []\n",
"    for article in articles:\n",
"        article_text = article.get_text().replace('\\n', '')\n",
"        # any() stops at the first hit, so an article matching several key words\n",
"        # is added once instead of once per matching key word\n",
"        if any(item in article_text for item in key_words):\n",
"            article_url = 'https://www.binance.com' + article.find('a').get('href')\n",
"            final_list.append([article_text, article_url])\n",
"    return final_list"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run a first pass to build the baseline stored in `old_urls`"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 38.2 ms, sys: 3.74 ms, total: 41.9 ms\n",
"Wall time: 147 ms\n"
]
}
],
"source": [
"%%time\n",
"# Baseline scrape: every article matching right now is recorded in old_urls\n",
"main_webpage = 'https://www.binance.com/en/support/categories/115000056351'\n",
"old_urls = extract_binance(main_webpage=main_webpage, key_words=key_words)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['Binance Lists Chiliz (CHZ)',\n",
" 'https://www.binance.com/en/support/articles/360033377831'],\n",
" ['Binance Completes Perlin Lottery Draw and Will Open Trading For PERL',\n",
" 'https://www.binance.com/en/support/articles/360032900851'],\n",
" ['Binance Lists Second BEP2 Community Listing Project - TomoChain (TOMO)',\n",
" 'https://www.binance.com/en/support/articles/360032514812'],\n",
" ['Introducing the Band Protocol (BAND) Token Sale on Binance Launchpad',\n",
" 'https://www.binance.com/en/support/articles/360033102832'],\n",
" ['Pepito', 'Fulanito']]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Placeholder entry appended to the fresh scrape so that the comparison with\n",
"# old_urls has at least one unseen item to report\n",
"a = ['Pepito', 'Fulanito']\n",
"\n",
"# Second scrape of the same page, extended in place with the placeholder\n",
"new_urls = extract_binance(main_webpage, key_words)\n",
"new_urls += [a]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pepito\n"
]
}
],
"source": [
"# Report every entry of the latest scrape that is absent from the baseline\n",
"for item in new_urls:\n",
"    if item in old_urls:\n",
"        continue  # already known, nothing to announce\n",
"    msg = item[0]  # first element of the pair is the article text\n",
"    print(msg)\n",
"    #api.update_status('Testing')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}