diff --git "a/Char_LSTM_Documentation.ipynb" "b/Char_LSTM_Documentation.ipynb" new file mode 100644--- /dev/null +++ "b/Char_LSTM_Documentation.ipynb" @@ -0,0 +1,2923 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import eikon as ek\n", + "import numpy as np\n", + "from IPython.display import HTML\n", + "import os\n", + "from time import sleep\n", + "from tqdm import tqdm\n", + "import sys\n", + "import datetime\n", + "from pytz import timezone # set timezone\n", + "# Ignore harmless warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "pd.set_option('display.max_rows', 1000)\n", + "pd.set_option('display.max_columns', 1000)\n", + "pd.set_option('display.width', 1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Get the data and news headlines for the symbols of Dow Jones, Nasdaq100, SP500.\n", + "2. Prepare the data\n", + "- PCO to intialize weights help in time computation reduction and global optima finding\n", + "- Denoising input data helps predict small price changes\n", + "3. Build Languauge model based on character embedding\n", + "- Epoch means one pass over the full training set\n", + "- Batch means that you use all your data to compute the gradient during one iteration.\n", + "- Mini-batch means you only take a subset of all your data during one iteration.\n", + "- In the context of SGD, \"Minibatch\" means that the gradient is calculated across the entire batch before updating weights. If you are not using a \"minibatch\", every training example in a \"batch\" updates the learning algorithm's parameters independently.\n", + "\n", + "- Batch Gradient Descent. Batch size is set to the total number of examples in the training dataset. (batch_size = len(train))\n", + "- Stochastic Gradient Descent. Batch size is set to one. (batch_size = 1)\n", + "- Minibatch Gradient Descent. Batch size is set to more than one and less than the total number of examples in the training dataset. (batch_size = 32,64...)\n", + "\n", + "\n", + "##### Benefits of Charac2vec:\n", + "- Having the character embedding, every single word’s vector can be formed even it is out-of-vocabulary words (optional). On the other hand, word embedding can only handle those seen words.\n", + "- Good fits for misspelling words\n", + "- handles infrequent words better than word2vec embedding as later one suffers from lack of enough training opportunity for those rare words\n", + "- Reduces model complexity and improving the performance (in terms of speed)\n", + "\n", + "##### Byte Level:\n", + "- When ASCII encoding is used, there is no difference between reading characters or bytes. The ASCII-way of encoding characters allows for 256 characters to be encoded and (surprise…) these 256 possible characters are stored as bytes.\n", + "4. Train Language Model and save embeddings representation and weights of the model.\n", + "5. Use weights and embeddings representation of language model to intialize new model that predict price direction movement, ultimaetly.\n", + "- Direction prediction correctness(DPC) will be used as final metric to evaluate on test data.\n", + "- DPC is %CorrectPredictions. 
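+ "\n", + "A minimal sketch of how DPC could be computed (hypothetical helper; assumes direction labels encoded as +1/-1):\n", + "```python\n", + "import numpy as np\n", + "\n", + "def dpc(y_true, y_pred):\n", + "    # share of predictions whose sign matches the true direction\n", + "    return float(np.mean(np.sign(y_true) == np.sign(y_pred)))\n", + "```"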
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get all data " + ] + }, + { + "cell_type": "raw", + "metadata": { + "jupyter": { + "outputs_hidden": true + } + }, + "source": [ + "# https://github.com/Refinitiv-API-Samples/Article.EikonAPI.Python.NewsSentimentAnalysis\n", + "#LMITS : https://developers.refinitiv.com/eikon-apis/eikon-data-api/docs?content=49692&type=documentation_item\n", + "# 10,000 reuests per day --- 5 requests per second --- 50 MB per minute ---\n", + "# import logging\n", + "# logger = logging.getLogger('pyeikon')\n", + "# logger.setLevel(5)\n", + "\n", + "ek.set_app_key('#######################')\n", + "ek.get_port_number()\n", + "# Get Index Stock Constitutes, separate them by a space, use the expression to search news\n", + "df, err = ek.get_data('0#.SPX','TR.RIC')\n", + "instr = df['Instrument'].tolist()\n", + "news_search_expr = ' '.join(instr)\n", + "ek.get_news_headlines(news_search_expr)\n", + "\n", + "sp500_symbols = list(ek.get_data(['0#.SPX'], 'TR.RIC')[0]['RIC'])\n", + "dowjones_symboles = list(ek.get_data(['0#.DJI'], 'TR.RIC')[0]['RIC']) \n", + "nasdaq100_symbols = list(ek.get_data(['0#.NDX'], 'TR.RIC')[0]['RIC'])\n", + "unique_index_symbols = list(np.unique(sp500_symbols + dowjones_symboles + nasdaq100_symbols))\n", + "db = pd.HDFStore('news_db.h5')\n", + "\n", + "for t in tqdm(unique_index_symbols):\n", + " for start in pd.date_range(start = '2020-09-01', end = datetime.datetime.now(),normalize=True, tz = 'US/Eastern',freq = 'D'):\n", + " end = start + datetime.timedelta(days=1)\n", + " start = start.strftime('%Y-%m-%dT%H:%M:%S')\n", + " end = end.strftime('%Y-%m-%dT%H:%M:%S') \n", + " news = ek.get_news_headlines('R:%s AND Language:LEN' % t, \n", + " count= 100,\n", + " date_from= start,\n", + " date_to= end)\n", + " if len(news) != 0:\n", + " try:\n", + " news['ticker'] = t.strip('.')[0]\n", + " db.put(t.strip('.')[0], news, format = 'table',append = True, \n", + " data_columns = True, \n", + " min_itemsize={'text': 1000, 'sourceCode': 1000, 'storyId': 100},\n", + " encoding = 'UTF-8')\n", + " print(t+':'+start+':'+end)\n", + " except IOError as err:\n", + " print(\"I/O error: {0}\".format(err))\n", + " except:\n", + " print(\"Unexpected error:\", sys.exc_info()[0])\n", + " raise\n", + " sleep(0.25)\n", + " else:\n", + " print('No News Found: '+t+':'+start+':'+end)\n", + " pass\n", + " sleep(0.25)\n", + " sleep(1)\n", + "db.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "625478\n" + ] + } + ], + "source": [ + "length = 0\n", + "with pd.HDFStore('./news_update/news_db.h5', mode = 'r') as store:\n", + " for i in store.keys():\n", + " length += len(store[i])\n", + "print(length)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "unique_index_symbols.reverse()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2020-08-14 20:03:19+0000', tz='UTC')" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[:, 'versionCreated'].min()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2020-08-14 
20:03:19+0000', tz='UTC')" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = db[unique_index_symbols[524].split('.')[0]]\n", + "df.versionCreated.min()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2020-07-27'" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=18)).strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "start: 2020-07-27T00:00:00 end: 2020-07-28T00:00:00\n", + "start: 2020-07-28T00:00:00 end: 2020-07-29T00:00:00\n", + "start: 2020-07-29T00:00:00 end: 2020-07-30T00:00:00\n", + "start: 2020-07-30T00:00:00 end: 2020-07-31T00:00:00\n", + "start: 2020-07-31T00:00:00 end: 2020-08-01T00:00:00\n", + "start: 2020-08-01T00:00:00 end: 2020-08-02T00:00:00\n", + "start: 2020-08-02T00:00:00 end: 2020-08-03T00:00:00\n", + "start: 2020-08-03T00:00:00 end: 2020-08-04T00:00:00\n", + "start: 2020-08-04T00:00:00 end: 2020-08-05T00:00:00\n", + "start: 2020-08-05T00:00:00 end: 2020-08-06T00:00:00\n", + "start: 2020-08-06T00:00:00 end: 2020-08-07T00:00:00\n", + "start: 2020-08-07T00:00:00 end: 2020-08-08T00:00:00\n", + "start: 2020-08-08T00:00:00 end: 2020-08-09T00:00:00\n", + "start: 2020-08-09T00:00:00 end: 2020-08-10T00:00:00\n", + "start: 2020-08-10T00:00:00 end: 2020-08-11T00:00:00\n", + "start: 2020-08-11T00:00:00 end: 2020-08-12T00:00:00\n", + "start: 2020-08-12T00:00:00 end: 2020-08-13T00:00:00\n", + "start: 2020-08-13T00:00:00 end: 2020-08-14T00:00:00\n" + ] + } + ], + "source": [ + "end_at = (df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')\n", + "begin_from = (df.loc[:, 'versionCreated'].min() - datetime.timedelta(days=18)).strftime('%Y-%m-%d')\n", + "for start in pd.date_range(start = begin_from, end = end_at,normalize=True, tz = 'US/Eastern',freq = 'D'):\n", + " end = start + datetime.timedelta(days=1)\n", + " start = start.strftime('%Y-%m-%dT%H:%M:%S')\n", + " end = end.strftime('%Y-%m-%dT%H:%M:%S') \n", + " print('start: ',start, 'end: ', end)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "length = 0\n", + "for i in range(ord('A'), ord('Z') + 1):\n", + " length += len(db[chr(i)])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2020-07-27'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = db['AAPL.N'.split('.')[0][0]].\n", + "df.loc[:, 'versionCreated'].min().strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import json\n", + "import os.path\n", + "\n", + "if os.path.isfile('unique_tickers.json') == False:\n", + " with open('unique_tickers.json', 'w', encoding='utf-8') as f:\n", + " json.dump(unique_index_symbols, f, ensure_ascii=False, indent=4)\n", + " \n", + "with open('unique_tickers.json', 'r') as data_file:\n", + " unique_index_symbols = json.load(data_file)\n" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "r'' will treat input string as raw (with \\n)\n", + "\\W for all non-words i.e. 
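{   "cell_type": "markdown",   "metadata": {},   "source": [    "A quick illustrative sketch of the `\\W+|_` pattern noted above (the same pattern appears, commented out, in the cleaning cells below):"   ]  },  {   "cell_type": "code",   "execution_count": null,   "metadata": {},   "outputs": [],   "source": [    "import re\n",    "\n",    "# \\W+ collapses every run of non-word characters into one space; the |_\n",    "# alternative also catches underscore, which \\W treats as a word character.\n",    "sample = 'Q2 EPS: $0.32 (non-GAAP), up_10%!'\n",    "print(re.sub(r'\\W+|_', ' ', sample).strip())  # Q2 EPS 0 32 non GAAP up 10\n"   ]  },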
{   "cell_type": "code",   "execution_count": 95,   "metadata": {},   "outputs": [    {     "name": "stderr",     "output_type": "stream",     "text": [      "100%|████████████████████████████████████████████████████████████████████████████████| 525/525 [06:58<00:00, 1.26it/s]\n"     ]    }   ],   "source": [    "import requests\n",    "import datetime\n",    "from time import sleep\n",    "from tqdm import tqdm\n",    "data = pd.DataFrame()\n",    "for t in tqdm(unique_index_symbols):\n",    "    r = requests.get(\"https://finnhub.io/api/v1/press-releases?symbol=\"+t+\"&token=bsbhg3nrh5rd8518ll30\")\n",    "    df = pd.DataFrame(r.json()['majorDevelopment'])\n",    "    data = data.append(df, ignore_index=True)\n",    "    sleep(0.5)\n",    "\n",    "data = data.rename(columns={'datetime': 'time'})\n",    "data.time = pd.to_datetime(data.time)\n",    "data.time = data.time.apply(lambda dt: int((dt - datetime.datetime(1970,1,1)).total_seconds()))\n",    "# data['description'] = data['description'].str.replace(r\"\\W+|_\", ' ')\n",    "# data['headline'] = data['headline'].str.replace(r\"\\W+|_\", ' ')\n",    "data.to_csv('news_test.csv', header = True, index = False)\n",    "#df.datetime = pd.to_datetime(df.datetime, unit='s')"   ]  },  {   "cell_type": "code",   "execution_count": 42,   "metadata": {},   "outputs": [],   "source": [    "r2 = requests.get('https://finnhub.io/api/v1/company-news?symbol=DUK&from=2006-01-01&to=2006-01-02&token=bsbhg3nrh5rd8518ll30')\n",    "df = pd.DataFrame(r2.json())\n",    "# df = df[['datetime', 'headline', 'id','related', 'source', 'summary']]\n",    "# df['summary'] = df['summary'].str.replace(r\"\\W+|_\", ' ')\n",    "# df['headline'] = df['headline'].str.replace(r\"\\W+|_\", ' ')\n",    "# df = df.rename(columns={'related': 'symbol'})"   ]  },  {   "cell_type": "code",   "execution_count": 96,   "metadata": {},   "outputs": [    {     "data": {      "text/plain": [       "(Timestamp('2019-09-02 00:00:00'),\n",       " Timestamp('2020-08-10 23:14:47'),\n",       " 32.88030888030888)"      ]     },     "execution_count": 96,     "metadata": {},     "output_type": "execute_result"    }   ],   "source": [    "#Summary Statistics\n",    "pd.to_datetime(data.time, unit='s').min(), pd.to_datetime(data.time, unit='s').max(), data.groupby('symbol')['headline'].count().mean()"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "## Prepare Data for Model"   ]  },  {   "cell_type": "code",   "execution_count": 82,   "metadata": {},   "outputs": [],   "source": [    "from keras.utils import to_categorical, plot_model\n",    "from keras.preprocessing.sequence import pad_sequences\n",    "from keras.models import Sequential\n",    "from keras.layers import LSTM, Dense, GRU, Embedding, Bidirectional, TimeDistributed\n",    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",    "import tensorflow as tf"   ]  },  {   "cell_type": "code",   "execution_count": 83,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "Using CPU\n",      "[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]\n"
     ]    }   ],   "source": [    "gpu_devices = tf.config.experimental.list_physical_devices('GPU')\n",    "\n",    "if gpu_devices:\n",    "    print('Using GPU')\n",    "    tf.config.experimental.set_memory_growth(gpu_devices[0], True)\n",    "else:\n",    "    print('Using CPU')\n",    "    print(tf.config.list_physical_devices(device_type=None))\n",    "    tf.config.optimizer.set_jit(True) # enable XLA JIT compilation on the CPU"   ]  },  {   "cell_type": "code",   "execution_count": 1100,   "metadata": {},   "outputs": [],   "source": [    "#clean the already cleaned headlines and prepend/append the sentence tokens\n",    "#8 bits will let you express 2^8 == 256 possible values. Thus restrict characters to 0 < ord() < 256. \n",    "def clean_text(df, column):\n",    "    import re \n",    "    #(\"\".join(headline)).strip()\n",    "    headline = []\n",    "    for i in df[column].apply(lambda x: '<s>'+x+'<\\s>'):\n",    "        #headline.append(re.sub('[!,*)@=#({|}_‑–?^;:{|}˚~\\t\\n“—’”/_]',r'',i))\n",    "        headline.append(i)\n",    "    return headline"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "https://realpython.com/python-encodings-guide/"   ]  },  {   "cell_type": "code",   "execution_count": 1101,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "[' ', '!', '\"', '#', '$', '%', '&', \"'\", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\\\', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '\\xa0', '£', '¥', '®', '´', 'µ', 'Æ', 'É', '×', 'Ø', 'à', 'á', 'ä', 'å', 'é', 'ê', 'ë', 'í', 'ï', 'ñ', 'ó', 'ö', 'ú', 'ü', 'ē', 'Š', '˚', '\\u200b', '\\u200c', '\\u200d', '‐', '‑', '–', '—', '’', '“', '”', '€', '™']\n",      "129\n"     ]    }   ],   "source": [    "text = clean_text(data, 'headline')\n",    "txt = ''\n",    "# Count Unique Characters\n",    "for doc in text:\n",    "    for s in doc:\n",    "        txt += s\n",    "chars = sorted(set(txt))\n",    "print(chars)\n",    "print(len(chars)) #52+10+25"   ]  },  {   "cell_type": "code",   "execution_count": 1102,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "  : 160\n",      "£ : 163\n",      "¥ : 165\n",      "® : 174\n",      "´ : 180\n",      "µ : 181\n",      "Æ : 198\n",      "É : 201\n",      "× : 215\n",      "Ø : 216\n",      "à : 224\n",      "á : 225\n",      "ä : 228\n",      "å : 229\n",      "é : 233\n",      "ê : 234\n",      "ë : 235\n",      "í : 237\n",      "ï : 239\n",      "ñ : 241\n",      "ó : 243\n",      "ö : 246\n",      "ú : 250\n",      "ü : 252\n",      "ē : 275\n",      "Š : 352\n",      "˚ : 730\n",      "​ : 8203\n",      "‌ : 8204\n",      "‍ : 8205\n",      "‐ : 8208\n",      "‑ : 8209\n",      "– : 8211\n",      "— : 8212\n",      "’ : 8217\n",      "“ : 8220\n",      "” : 8221\n",      "€ : 8364\n",      "™ : 8482\n"     ]    }   ],   "source": [    "#reserve 0 for padding: check that no character already maps to it\n",    "#NOTE: (len(bytes(i, encoding = 'utf-8')) > 1) == ord(i) > 127\n",    "for i in chars: \n",    "    if (ord(i) == 0) | (ord(i) > 127):\n",    "        print(i,':',ord(i))\n",    "# {char: i for i, char in zip(map(ord, chars),chars)}"   ]  },  {   "cell_type": "code",   "execution_count": 1103,   "metadata": {},   "outputs": [],   "source": [    "def encode2bytes(text):\n",    "    #text = tf.strings.unicode_split(text, 'UTF-8').to_list()\n",
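   "    # ord(char) < 128 keeps ASCII characters only, so each kept character\n",    "    # fits in a single byte; non-ASCII characters are silently dropped here.\n",    "    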
final_list = []\n", + " for sent in text:\n", + " temp_list = []\n", + " for char in sent:\n", + " if ord(char) < 128 :\n", + " temp_list.append(ord(char))\n", + " final_list.append(temp_list)\n", + " return final_list" + ] + }, + { + "cell_type": "code", + "execution_count": 1104, + "metadata": {}, + "outputs": [], + "source": [ + "b_text = encode2bytes(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 1105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "412" + ] + }, + "execution_count": 1105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_sentence_len = max([len(sentence) for sentence in b_text])\n", + "max_sentence_len" + ] + }, + { + "cell_type": "code", + "execution_count": 1112, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes\n", + "Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 65, 110, 110, 111, 117, 110, 99, 101, 115, 32, 80, 114, 105, 99, 105, 110, 103, 32, 79, 102, 32, 36, 53, 48, 48, 32, 77, 105, 108, 108, 105, 111, 110, 32, 79, 102, 32, 83, 101, 110, 105, 111, 114, 32, 78, 111, 116, 101, 115, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed\n", + "Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 70, 105, 108, 101, 115, 32, 70, 111, 114, 32, 80, 111, 116, 101, 110, 116, 105, 97, 108, 32, 83, 101, 110, 105, 111, 114, 32, 78, 111, 116, 101, 115, 32, 79, 102, 102, 101, 114, 105, 110, 103, 32, 83, 105, 122, 101, 32, 78, 111, 116, 32, 68, 105, 115, 99, 108, 111, 115, 101, 100, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Q2 GAAP Earnings Per Share $0.32\n", + "Agilent Technologies Q2 GAAP Earnings Per Share $0.32<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 81, 50, 32, 71, 65, 65, 80, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 80, 101, 114, 32, 83, 104, 97, 114, 101, 32, 36, 48, 46, 51, 50, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent PD-L1 Assay Receives FDA Approval For Use As Companion Diagnostic\n", + "Agilent PD-L1 Assay Receives FDA Approval For Use As Companion Diagnostic<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 80, 68, 45, 76, 49, 32, 65, 115, 115, 97, 121, 32, 82, 101, 99, 101, 105, 118, 101, 115, 32, 70, 68, 65, 32, 65, 112, 112, 114, 111, 118, 97, 108, 32, 70, 111, 114, 32, 85, 115, 101, 32, 65, 115, 32, 67, 111, 109, 112, 97, 110, 105, 111, 110, 32, 68, 105, 97, 103, 110, 111, 115, 116, 105, 99, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "FDA Approves Nivolumab Plus Ipilimumab For First-Line Metastatic Non-Small Cell Lung Cancer\n", + "FDA Approves Nivolumab Plus Ipilimumab For First-Line Metastatic Non-Small Cell Lung Cancer<\\s>\n", + "[60, 115, 62, 70, 68, 65, 32, 65, 112, 112, 114, 111, 118, 101, 115, 32, 78, 105, 118, 111, 108, 117, 109, 97, 98, 32, 80, 108, 117, 115, 32, 73, 112, 105, 108, 105, 109, 117, 109, 97, 98, 32, 70, 111, 114, 32, 70, 105, 114, 115, 116, 45, 76, 105, 110, 101, 32, 77, 101, 116, 97, 115, 116, 
97, 116, 105, 99, 32, 78, 111, 110, 45, 83, 109, 97, 108, 108, 32, 67, 101, 108, 108, 32, 76, 117, 110, 103, 32, 67, 97, 110, 99, 101, 114, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Gets FDA Approval For Pd-L1 Companion Diagnostic On Dako Omnis\n", + "Agilent Gets FDA Approval For Pd-L1 Companion Diagnostic On Dako Omnis<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 71, 101, 116, 115, 32, 70, 68, 65, 32, 65, 112, 112, 114, 111, 118, 97, 108, 32, 70, 111, 114, 32, 80, 100, 45, 76, 49, 32, 67, 111, 109, 112, 97, 110, 105, 111, 110, 32, 68, 105, 97, 103, 110, 111, 115, 116, 105, 99, 32, 79, 110, 32, 68, 97, 107, 111, 32, 79, 109, 110, 105, 115, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Enters Into Amendment No. 3 To Credit Agreement\n", + "Agilent Technologies Enters Into Amendment No. 3 To Credit Agreement<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 69, 110, 116, 101, 114, 115, 32, 73, 110, 116, 111, 32, 65, 109, 101, 110, 100, 109, 101, 110, 116, 32, 78, 111, 46, 32, 51, 32, 84, 111, 32, 67, 114, 101, 100, 105, 116, 32, 65, 103, 114, 101, 101, 109, 101, 110, 116, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent withdraws Q2, FY guidance, cites coronavirus impact\n", + "Agilent withdraws Q2, FY guidance, cites coronavirus impact<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 119, 105, 116, 104, 100, 114, 97, 119, 115, 32, 81, 50, 44, 32, 70, 89, 32, 103, 117, 105, 100, 97, 110, 99, 101, 44, 32, 99, 105, 116, 101, 115, 32, 99, 111, 114, 111, 110, 97, 118, 105, 114, 117, 115, 32, 105, 109, 112, 97, 99, 116, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Reports Q1 Non-GAAP Earnings Per Share $0.81\n", + "Agilent Technologies Reports Q1 Non-GAAP Earnings Per Share $0.81<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 82, 101, 112, 111, 114, 116, 115, 32, 81, 49, 32, 78, 111, 110, 45, 71, 65, 65, 80, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 80, 101, 114, 32, 83, 104, 97, 114, 101, 32, 36, 48, 46, 56, 49, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Pershing Square Capital Management Lp Raises Sole Share Stake In Agilent Technologies\n", + "Pershing Square Capital Management Lp Raises Sole Share Stake In Agilent Technologies<\\s>\n", + "[60, 115, 62, 80, 101, 114, 115, 104, 105, 110, 103, 32, 83, 113, 117, 97, 114, 101, 32, 67, 97, 112, 105, 116, 97, 108, 32, 77, 97, 110, 97, 103, 101, 109, 101, 110, 116, 32, 76, 112, 32, 82, 97, 105, 115, 101, 115, 32, 83, 111, 108, 101, 32, 83, 104, 97, 114, 101, 32, 83, 116, 97, 107, 101, 32, 73, 110, 32, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Announces Twist Bioscience Agrees To Pay $22.5 Million Settlement In Ip Litigation\n", + "Agilent Technologies Announces Twist Bioscience Agrees To Pay $22.5 Million Settlement In Ip Litigation<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 65, 110, 110, 111, 117, 110, 99, 101, 115, 32, 84, 119, 105, 115, 116, 32, 66, 105, 111, 115, 99, 105, 101, 110, 99, 101, 32, 65, 103, 114, 101, 101, 115, 32, 84, 111, 32, 80, 97, 121, 32, 36, 50, 50, 46, 53, 32, 77, 105, 108, 108, 105, 111, 110, 32, 83, 101, 116, 116, 
108, 101, 109, 101, 110, 116, 32, 73, 110, 32, 73, 112, 32, 76, 105, 116, 105, 103, 97, 116, 105, 111, 110, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Says CEO's FY 2019 Total Compensation Was $12.52 Mln Vs $11.7 Mln In FY 2018\n", + "Agilent Technologies Says CEO's FY 2019 Total Compensation Was $12.52 Mln Vs $11.7 Mln In FY 2018<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 83, 97, 121, 115, 32, 67, 69, 79, 39, 115, 32, 70, 89, 32, 50, 48, 49, 57, 32, 84, 111, 116, 97, 108, 32, 67, 111, 109, 112, 101, 110, 115, 97, 116, 105, 111, 110, 32, 87, 97, 115, 32, 36, 49, 50, 46, 53, 50, 32, 77, 108, 110, 32, 86, 115, 32, 36, 49, 49, 46, 55, 32, 77, 108, 110, 32, 73, 110, 32, 70, 89, 32, 50, 48, 49, 56, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Pershing Square Takes Sole Share Stake In Agilent Technologies\n", + "Pershing Square Takes Sole Share Stake In Agilent Technologies<\\s>\n", + "[60, 115, 62, 80, 101, 114, 115, 104, 105, 110, 103, 32, 83, 113, 117, 97, 114, 101, 32, 84, 97, 107, 101, 115, 32, 83, 111, 108, 101, 32, 83, 104, 97, 114, 101, 32, 83, 116, 97, 107, 101, 32, 73, 110, 32, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Akoya Biosciences, Inc. announced that it has received $50 million in funding from Telegraph Hill Partners, Innovatus Capital Partners, LLC, PJC Capital Management LLC, Agilent Technologies, Inc.\n", + "Akoya Biosciences, Inc. announced that it has received $50 million in funding from Telegraph Hill Partners, Innovatus Capital Partners, LLC, PJC Capital Management LLC, Agilent Technologies, Inc.<\\s>\n", + "[60, 115, 62, 65, 107, 111, 121, 97, 32, 66, 105, 111, 115, 99, 105, 101, 110, 99, 101, 115, 44, 32, 73, 110, 99, 46, 32, 97, 110, 110, 111, 117, 110, 99, 101, 100, 32, 116, 104, 97, 116, 32, 105, 116, 32, 104, 97, 115, 32, 114, 101, 99, 101, 105, 118, 101, 100, 32, 36, 53, 48, 32, 109, 105, 108, 108, 105, 111, 110, 32, 105, 110, 32, 102, 117, 110, 100, 105, 110, 103, 32, 102, 114, 111, 109, 32, 84, 101, 108, 101, 103, 114, 97, 112, 104, 32, 72, 105, 108, 108, 32, 80, 97, 114, 116, 110, 101, 114, 115, 44, 32, 73, 110, 110, 111, 118, 97, 116, 117, 115, 32, 67, 97, 112, 105, 116, 97, 108, 32, 80, 97, 114, 116, 110, 101, 114, 115, 44, 32, 76, 76, 67, 44, 32, 80, 74, 67, 32, 67, 97, 112, 105, 116, 97, 108, 32, 77, 97, 110, 97, 103, 101, 109, 101, 110, 116, 32, 76, 76, 67, 44, 32, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 44, 32, 73, 110, 99, 46, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies, Inc. Presents at 2019 HR Life Sciences Conference, Dec-04-2019\n", + "Agilent Technologies, Inc. Presents at 2019 HR Life Sciences Conference, Dec-04-2019<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 44, 32, 73, 110, 99, 46, 32, 80, 114, 101, 115, 101, 110, 116, 115, 32, 97, 116, 32, 50, 48, 49, 57, 32, 72, 82, 32, 76, 105, 102, 101, 32, 83, 99, 105, 101, 110, 99, 101, 115, 32, 67, 111, 110, 102, 101, 114, 101, 110, 99, 101, 44, 32, 68, 101, 99, 45, 48, 52, 45, 50, 48, 49, 57, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies, Inc. Reports Earnings Results for the Fourth Quarter Ended October 31, 2019\n", + "Agilent Technologies, Inc. 
Reports Earnings Results for the Fourth Quarter Ended October 31, 2019<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 44, 32, 73, 110, 99, 46, 32, 82, 101, 112, 111, 114, 116, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 82, 101, 115, 117, 108, 116, 115, 32, 102, 111, 114, 32, 116, 104, 101, 32, 70, 111, 117, 114, 116, 104, 32, 81, 117, 97, 114, 116, 101, 114, 32, 69, 110, 100, 101, 100, 32, 79, 99, 116, 111, 98, 101, 114, 32, 51, 49, 44, 32, 50, 48, 49, 57, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Inc. Provides Earnings Guidance for the First Quarter and Full Fiscal Year of 2020\n", + "Agilent Technologies Inc. Provides Earnings Guidance for the First Quarter and Full Fiscal Year of 2020<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 73, 110, 99, 46, 32, 80, 114, 111, 118, 105, 100, 101, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 71, 117, 105, 100, 97, 110, 99, 101, 32, 102, 111, 114, 32, 116, 104, 101, 32, 70, 105, 114, 115, 116, 32, 81, 117, 97, 114, 116, 101, 114, 32, 97, 110, 100, 32, 70, 117, 108, 108, 32, 70, 105, 115, 99, 97, 108, 32, 89, 101, 97, 114, 32, 111, 102, 32, 50, 48, 50, 48, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Companion Diagnostic Expands CE-IVD mark in Europe to include Head and Neck Squamous Cell Carcinoma (HNSCC)\n", + "Agilent Companion Diagnostic Expands CE-IVD mark in Europe to include Head and Neck Squamous Cell Carcinoma (HNSCC)<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 67, 111, 109, 112, 97, 110, 105, 111, 110, 32, 68, 105, 97, 103, 110, 111, 115, 116, 105, 99, 32, 69, 120, 112, 97, 110, 100, 115, 32, 67, 69, 45, 73, 86, 68, 32, 109, 97, 114, 107, 32, 105, 110, 32, 69, 117, 114, 111, 112, 101, 32, 116, 111, 32, 105, 110, 99, 108, 117, 100, 101, 32, 72, 101, 97, 100, 32, 97, 110, 100, 32, 78, 101, 99, 107, 32, 83, 113, 117, 97, 109, 111, 117, 115, 32, 67, 101, 108, 108, 32, 67, 97, 114, 99, 105, 110, 111, 109, 97, 32, 40, 72, 78, 83, 67, 67, 41, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies Increases Cash Dividend To 18 Cents Per Share\n", + "Agilent Technologies Increases Cash Dividend To 18 Cents Per Share<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 73, 110, 99, 114, 101, 97, 115, 101, 115, 32, 67, 97, 115, 104, 32, 68, 105, 118, 105, 100, 101, 110, 100, 32, 84, 111, 32, 49, 56, 32, 67, 101, 110, 116, 115, 32, 80, 101, 114, 32, 83, 104, 97, 114, 101, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies, Inc. Announces Quarterly Dividend, Payable on Jan. 22, 2020\n", + "Agilent Technologies, Inc. Announces Quarterly Dividend, Payable on Jan. 22, 2020<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 44, 32, 73, 110, 99, 46, 32, 65, 110, 110, 111, 117, 110, 99, 101, 115, 32, 81, 117, 97, 114, 116, 101, 114, 108, 121, 32, 68, 105, 118, 105, 100, 101, 110, 100, 44, 32, 80, 97, 121, 97, 98, 108, 101, 32, 111, 110, 32, 74, 97, 110, 46, 32, 50, 50, 44, 32, 50, 48, 50, 48, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n", + "Agilent Technologies and Guangzhou Burning Rock Medical Laboratory Co., Ltd. 
Launch Magnis BR to Innovate Total Process Automation for NGS Testing\n", + "Agilent Technologies and Guangzhou Burning Rock Medical Laboratory Co., Ltd. Launch Magnis BR to Innovate Total Process Automation for NGS Testing<\\s>\n", + "[60, 115, 62, 65, 103, 105, 108, 101, 110, 116, 32, 84, 101, 99, 104, 110, 111, 108, 111, 103, 105, 101, 115, 32, 97, 110, 100, 32, 71, 117, 97, 110, 103, 122, 104, 111, 117, 32, 66, 117, 114, 110, 105, 110, 103, 32, 82, 111, 99, 107, 32, 77, 101, 100, 105, 99, 97, 108, 32, 76, 97, 98, 111, 114, 97, 116, 111, 114, 121, 32, 67, 111, 46, 44, 32, 76, 116, 100, 46, 32, 76, 97, 117, 110, 99, 104, 32, 77, 97, 103, 110, 105, 115, 32, 66, 82, 32, 116, 111, 32, 73, 110, 110, 111, 118, 97, 116, 101, 32, 84, 111, 116, 97, 108, 32, 80, 114, 111, 99, 101, 115, 115, 32, 65, 117, 116, 111, 109, 97, 116, 105, 111, 110, 32, 102, 111, 114, 32, 78, 71, 83, 32, 84, 101, 115, 116, 105, 110, 103, 60, 92, 115, 62]\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for i in range(0,21):\n", + " print(data.headline[i])\n", + " print(text[i])\n", + " print(b_text[i])\n", + " print('\\n'*3)" + ] + }, + { + "cell_type": "code", + "execution_count": 1113, + "metadata": {}, + "outputs": [], + "source": [ + "def split_X_y(text):\n", + " X = []\n", + " y = []\n", + " for i in text:\n", + " X.append(i[0:-1])\n", + " y.append(i[1:])\n", + " return X,y" + ] + }, + { + "cell_type": "code", + "execution_count": 1150, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = split_X_y(b_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 1118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[60, 115, 62, 86, 101, 114, 105, 122, 111, 110, 32, 67, 111, 109, 109, 117, 110, 105, 99, 97, 116, 105, 111, 110, 115, 32, 73, 110, 99, 46, 32, 80, 114, 111, 118, 105, 100, 101, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 71, 117, 105, 100, 97, 110, 99, 101, 32, 102, 111, 114, 32, 116, 104, 101, 32, 83, 101, 99, 111, 110, 100, 32, 81, 117, 97, 114, 116, 101, 114, 32, 111, 102, 32, 50, 48, 49, 57, 60, 92, 115]\n", + "Verizon Communications Inc. 
Provides Earnings Guidance for the Second Quarter of 2019<\\s>\n", + "[115, 62, 86, 101, 114, 105, 122, 111, 110, 32, 67, 111, 109, 109, 117, 110, 105, 99, 97, 116, 105, 111, 110, 115, 32, 73, 110, 99, 46, 32, 80, 114, 111, 118, 105, 100, 101, 115, 32, 69, 97, 114, 110, 105, 110, 103, 115, 32, 71, 117, 105, 100, 97, 110, 99, 101, 32, 102, 111, 114, 32, 116, 104, 101, 32, 83, 101, 99, 111, 110, 100, 32, 81, 117, 97, 114, 116, 101, 114, 32, 111, 102, 32, 50, 48, 49, 57, 60, 92, 115, 62]\n" + ] + } + ], + "source": [ + "#Each sentence is represented as the concatenation of bytes that form its characters in utf-8 encoding.\n", + "num = np.random.randint(0, len(X))\n", + "print(X[num])\n", + "print(text[num])\n", + "print(y[num])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ###########OLD#########################" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check UTF-8 Encoding \n", + "byte_text = []\n", + "for i in text:\n", + " i = i.encode('utf-8')\n", + " byte_text.append(i)\n", + "byte_text[0:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def encode2bytes(text):\n", + " final_list = []\n", + " for sent in text:\n", + " sent = sent.encode('utf-8')\n", + " temp_list = []\n", + " for char in sent:\n", + " temp_list.append(char)\n", + " final_list.append(temp_list)\n", + " return final_list" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "char2ind = {char: index +1 for index, char in enumerate(chars)} #index +1 to leave 0 for padding\n", + "max_sentence_len = max([len(sentence) for sentence in text])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "from keras.preprocessing.text import Tokenizer\n", + "# Initialization\n", + "tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK', lower = False)\n", + "# Fitting\n", + "tk.fit_on_texts(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# Use char_dict to replace the tk.word_index\n", + "tk.word_index = char2ind \n", + "# Add 'UNK' to the vocabulary \n", + "tk.word_index[tk.oov_token] = max(char2ind.values()) + 1\n", + "\n", + "# invert encoding\n", + "#index2char = {char: index for index, char in tk.word_index.items()}\n", + "chars.append(tk.oov_token)\n", + "chars.insert(0,'')\n", + "index2char = np.array(chars)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{' ': 1, '.': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12, '<': 13, '>': 14, 'A': 15, 'B': 16, 'C': 17, 'D': 18, 'E': 19, 'F': 20, 'G': 21, 'H': 22, 'I': 23, 'J': 24, 'K': 25, 'L': 26, 'M': 27, 'N': 28, 'O': 29, 'P': 30, 'Q': 31, 'R': 32, 'S': 33, 'T': 34, 'U': 35, 'V': 36, 'W': 37, 'X': 38, 'Y': 39, 'Z': 40, '\\\\': 41, 'a': 42, 'b': 43, 'c': 44, 'd': 45, 'e': 46, 'f': 47, 'g': 48, 'h': 49, 'i': 50, 'j': 51, 'k': 52, 'l': 53, 'm': 54, 'n': 55, 'o': 56, 'p': 57, 'q': 58, 'r': 59, 's': 60, 't': 61, 'u': 62, 'v': 63, 'w': 64, 'x': 65, 'y': 66, 'z': 67, 'µ': 68, 'Æ': 69, 'É': 70, 'Ø': 71, 'à': 72, 'á': 73, 'ä': 74, 'å': 75, 'é': 76, 'ê': 77, 'ë': 78, 'í': 79, 'ï': 80, 'ñ': 81, 'ó': 82, 'ö': 83, 'ú': 84, 'ü': 85, 'ē': 86, 'Š': 87, 'UNK': 88}\n", + "\n", + "['' ' ' 
'.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '<' '>' 'A' 'B' 'C'\n",      " 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U'\n",      " 'V' 'W' 'X' 'Y' 'Z' '\\\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l'\n",      " 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' 'µ' 'Æ' 'É' 'Ø'\n",      " 'à' 'á' 'ä' 'å' 'é' 'ê' 'ë' 'í' 'ï' 'ñ' 'ó' 'ö' 'ú' 'ü' 'ē' 'Š' 'UNK']\n"     ]    }   ],   "source": [    "print(tk.word_index)\n",    "print()\n",    "print(index2char)"   ]  },  {   "cell_type": "code",   "execution_count": 29,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "88\n",      "88\n",      "88\n"     ]    }   ],   "source": [    "print(max(char2ind.values())),print(max(tk.word_index.values())), print(len(tk.word_index));"   ]  },  {   "cell_type": "code",   "execution_count": 50,   "metadata": {},   "outputs": [],   "source": [    "sequences = list()\n",    "for line in text:\n",    "    # integer encode line\n",    "    encoded_seq = np.array([tk.word_index[char] for char in line])\n",    "    #encoded_seq = np.array([ord(char) for char in line])\n",    "\n",    "    # store\n",    "    sequences.append(encoded_seq)"   ]  },  {   "cell_type": "code",   "execution_count": 56,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "Agilent Technologies Announces Pricing Of 500 Million Of Senior Notes<\\s>\n",      "[13, 4, 14, 23, 22, 10, 11, 3, 5, 9, 2, 34, 3, 12, 17, 5, 6, 11, 6, 22, 10, 3, 4, 2, 23, 5, 5, 6, 16, 5, 12, 3, 4, 2, 29, 7, 10, 12, 10, 5, 22, 2, 36, 26, 2, 54, 25, 25, 2, 38, 10, 11, 11, 10, 6, 5, 2, 36, 26, 2, 20, 3, 5, 10, 6, 7, 2, 43, 6, 9, 3, 4, 13, 24, 4, 14]\n"     ]    }   ],   "source": [    "# transform text characters to unique indices\n",    "sequences = tk.texts_to_sequences(text)\n",    "print(text[0])\n",    "print(sequences[0])\n"   ]  },  {   "cell_type": "code",   "execution_count": 72,   "metadata": {},   "outputs": [],   "source": [    "X = []\n",    "y = []\n",    "for i in sequences:\n",    "    X.append(i[0:-1])\n",    "    y.append(i[1:])"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "# ###########OLD#########################"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "Masking is a way to tell sequence-processing layers that certain timesteps in an input are missing, and thus should be skipped when processing the data.\n",    "\n",    "Padding is a special form of masking where the masked steps are at the start or at the end of a sequence. Padding comes from the need to encode sequence data into contiguous batches: in order to make all sequences in a batch fit a given standard length, it is necessary to pad or truncate some sequences. A minimal masking sketch follows below.\n",    "https://stackoverflow.com/questions/53172852/masking-zero-inputs-in-lstm-in-keras-without-using-embedding"   ]  },
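{   "cell_type": "markdown",   "metadata": {},   "source": [    "A minimal sketch of masking the 0-padded timesteps downstream (hypothetical layer sizes; assumes the byte encoding above, where 0 is reserved as the padding value):"   ]  },  {   "cell_type": "code",   "execution_count": null,   "metadata": {},   "outputs": [],   "source": [    "from keras.models import Sequential\n",    "from keras.layers import Embedding, LSTM, Dense, TimeDistributed\n",    "\n",    "vocab_size = 128  # byte values 0-127; 0 doubles as the padding index\n",    "model = Sequential()\n",    "# mask_zero=True makes downstream layers skip the 0-padded timesteps\n",    "model.add(Embedding(input_dim=vocab_size, output_dim=16, mask_zero=True))\n",    "model.add(LSTM(64, return_sequences=True))\n",    "model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))"   ]  },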
{   "cell_type": "code",   "execution_count": 1153,   "metadata": {},   "outputs": [],   "source": [    "X = pad_sequences(X, maxlen = max_sentence_len, padding = 'post')\n",    "y = pad_sequences(y, maxlen = max_sentence_len, padding = 'post')"   ]  },  {   "cell_type": "code",   "execution_count": 1121,   "metadata": {},   "outputs": [    {     "name": "stdout",     "output_type": "stream",     "text": [      "Verizon Communications Inc. Provides Earnings Guidance for the Second Quarter of 2019<\\s>\n",      "[ 60 115 62 86 101 114 105 122 111 110 32 67 111 109 109 117 110 105\n",      "  99 97 116 105 111 110 115 32 73 110 99 46 32 80 114 111 118 105\n",      " 100 101 115 32 69 97 114 110 105 110 103 115 32 71 117 105 100 97\n",      " 110 99 101 32 102 111 114 32 116 104 101 32 83 101 99 111 110 100\n",      "  32 81 117 97 114 116 101 114 32 111 102 32 50 48 49 57 60 92\n",      " 115 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",      "[115 62 86 101 114 105 122 111 110 32 67 111 109 109 117 110 105 99\n",      "  97 116 105 111 110 115 32 73 110 99 46 32 80 114 111 118 105 100\n",      " 101 115 32 69 97 114 110 105 110 103 115 32 71 117 105 100 97 110\n",      "  99 101 32 102 111 114 32 116 104 101 32 83 101 99 111 110 100 32\n",      "  81 117 97 114 116 101 114 32 111 102 32 50 48 49 57 60 92 115\n",      "  62 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",      "   0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n"     ]    }   ],   "source": [    "# Add padding for longer sentences\n",    "print(text[num])\n",    "print(X[num])\n",    "print(y[num])"   ]  },  {   "cell_type": "code",   "execution_count": 1122,   "metadata": {},   "outputs": [],   "source": [    "#split input / output\n",    "# X ,y = X_char[:,:-1], X_char[:,1:]\n",    "# print('X: ' , X[0], len(X[0]) ,'\\n', 'y: ', y[0], len(y[0]))"   ]  },  {   "cell_type": "code",   "execution_count": 1123,   "metadata": {},   "outputs": [    {     "data": {      "text/plain": [       "((17032, 412), (17032, 412))"      ]     },     "execution_count": 1123,     "metadata": {},     "output_type": "execute_result"    }   ],   "source": [    "X.shape, y.shape"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "### Another Option of Splitting Sequence\n",    "*But this loses the input/output split, since padding was applied before splitting*"   ]  },  {   "cell_type": "code",   "execution_count": 822,   "metadata": {},   "outputs": [],   "source": [    "padded_seq = pad_sequences(b_text, maxlen = max_sentence_len, padding = 'post')"   ]  },  {   "cell_type": "markdown",   "metadata": {},   "source": [    "Use the 
`tf.data.Dataset.from_tensor_slices` function to convert a text vector into a stream of character indices." + ] + }, + { + "cell_type": "code", + "execution_count": 823, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\\s>\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n", + "Agilent Technologies Files For Potential Senior Notes Offering Size Not 
Disclosed<\\s>\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n", + "Agilent Technologies Q2 GAAP Earnings Per Share 
$0.32<\\s>\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n", + "Agilent PD-L1 Assay Receives FDA Approval For Use As Companion 
Diagnostic<\\s>\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n", + "FDA Approves Nivolumab Plus Ipilimumab For First-Line Metastatic Non-Small Cell Lung 
Cancer<\\s>\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\n", + "Agilent Gets FDA Approval For Pd-L1 Companion Diagnostic On Dako 
Omnis<\\s>\n",
      "Agilent Technologies Enters Into Amendment No. 3 To Credit Agreement<\\s>\n",
      "Agilent withdraws Q2 FY guidance cites coronavirus impact<\\s>\n",
      "Agilent Technologies Reports Q1 Non-GAAP Earnings Per Share $0.81<\\s>\n",
      "Pershing Square Capital Management Lp Raises Sole Share Stake In Agilent Technologies<\\s>\n",
      "(trailing \\u0000 padding bytes trimmed from each line above)\n"
     ]
    }
   ],
   "source": [
    "# Create Training Sequences\n",
    "char_dataset = tf.data.Dataset.from_tensor_slices(padded_seq)\n",
    "print(char_dataset)\n",
    "for i in char_dataset.take(10):\n",
    "    print(\"\".join(map(chr, i)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 824,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_seq_targets(seq):\n",
    "    input_txt = seq[:-1]\n",
    "    target_txt = seq[1:]\n",
    "    return input_txt, target_txt"
   ]
  },
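  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `create_seq_targets` (a minimal sketch on made-up toy bytes, not real data): the target is simply the input shifted one byte to the left, so at every timestep the model is trained to predict the next character."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical toy example: the byte ids for '<s>Agi' plus one \\x00 pad byte\n",
    "toy_seq = np.array([60, 115, 62, 65, 103, 105, 0])\n",
    "inp, tgt = create_seq_targets(toy_seq)\n",
    "print(''.join(map(chr, inp)))  # '<s>Agi'    (sequence minus its last byte)\n",
    "print(''.join(map(chr, tgt)))  # 's>Agi\\x00' (same sequence shifted left by one)"
   ]
  },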
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "___________________________________________________________________________________________________________________________________"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1335,
   "metadata": {},
   "outputs": [],
   "source": [
    "# seq_data = char_dataset.map(create_seq_targets)\n",
    "# seq_data\n",
    "train_size = len(text) * 50 // 100\n",
    "train_seq_data = tf.data.Dataset.from_tensor_slices((X[:train_size], y[:train_size]))\n",
    "test_seq_data = tf.data.Dataset.from_tensor_slices((X[train_size:], y[train_size:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1336,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------Headline--------------------------------\n",
      "[ 60 115  62  65 103 105 108 101 110 116  32  84 101  99 104 110 111 108 … 0 0 0]  (input byte ids; zero padding trimmed)\n",
      "Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\\s\n",
      "\n",
      "\n",
      "[115  62  65 103 105 108 101 110 116  32  84 101  99 104 110 111 108 111 … 0 0 0]  (target byte ids; zero padding trimmed)\n",
      "s>Agilent Technologies Announces Pricing Of $500 Million Of Senior Notes<\\s>\n",
      "--------------------------------Headline--------------------------------\n",
      "Agilent Technologies Files For Potential Senior Notes Offering Size Not Disclosed  (byte-id arrays trimmed; target is again shifted one byte left)\n",
      "--------------------------------Headline--------------------------------\n",
      "Agilent Technologies Q2 GAAP Earnings Per Share $0.32\n",
      "--------------------------------Headline--------------------------------\n",
      "Agilent PD-L1 Assay Receives FDA Approval For Use As Companion Diagnostic\n",
      "--------------------------------Headline--------------------------------\n",
      "FDA Approves Nivolumab Plus Ipilimumab For First-Line Metastatic Non-Small Cell Lung Cancer\n"
     ]
    }
   ],
   "source": [
    "# Checking OR next(iter(...))\n",
    "for input_txt, target_txt in seq_data.take(5):\n",
    "    print('--------------------------------Headline--------------------------------')\n",
    "    print(input_txt.numpy())\n",
    "    print(\"\".join(map(chr, input_txt.numpy())))\n",
    "#     print(''.join(index2char[input_txt.numpy()]))\n",
    "    print('\\n')\n",
    "    print(target_txt.numpy())\n",
    "    print(\"\".join(map(chr, target_txt.numpy())))\n",
    "    # There is an extra whitespace!\n",
    "#     print(''.join(index2char[target_txt.numpy()]))"
   ]
  },
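  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The raw printouts above are dominated by trailing \\u0000 padding bytes. A small helper (added here for readability; `decode_bytes` is hypothetical and not part of the original pipeline) strips the zero padding before decoding:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def decode_bytes(arr):\n",
    "    # Drop the zero padding bytes, then map each byte id back to its character\n",
    "    return ''.join(chr(b) for b in arr if b != 0)\n",
    "\n",
    "for input_txt, target_txt in seq_data.take(1):\n",
    "    print(decode_bytes(input_txt.numpy()))\n",
    "    print(decode_bytes(target_txt.numpy()))"
   ]
  },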
  {
   "cell_type": "code",
   "execution_count": 1356,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch size\n",
    "batch_size = 128\n",
    "\n",
    "# Buffer size to shuffle the dataset, so it doesn't attempt to shuffle\n",
    "# the entire sequence in memory. Instead, it maintains a buffer in which it shuffles elements.\n",
    "buffer_size = 10000\n",
    "\n",
    "dataset = train_seq_data.shuffle(buffer_size, seed = 42).batch(batch_size, drop_remainder=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1357,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<BatchDataset shapes: ((128, 412), (128, 412)), types: (tf.int32, tf.int32)>"
      ]
     },
     "execution_count": 1357,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1358,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(1, 128, 412), dtype=int32, numpy=… values trimmed …>"
      ]
     },
     "execution_count": 1358,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_example_batch, train_label_batch = next(iter(dataset.batch(1)))\n",
    "train_example_batch"
   ]
  },
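  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because `drop_remainder=True` discards any final partial batch, one epoch runs floor(num_train_sequences / 128) optimizer steps. A minimal sketch of that arithmetic (assuming `train_seq_data` still holds the unbatched training pairs):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_train = sum(1 for _ in train_seq_data)          # number of unbatched training sequences\n",
    "steps_per_epoch = num_train // batch_size           # full batches seen per epoch\n",
    "dropped = num_train - steps_per_epoch * batch_size  # examples lost to drop_remainder\n",
    "print(num_train, steps_per_epoch, dropped)"
   ]
  },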
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Language Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1359,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.losses import sparse_categorical_crossentropy\n",
    "# https://datascience.stackexchange.com/questions/41921/sparse-categorical-crossentropy-vs-categorical-crossentropy-keras-accuracy\n",
    "def sparse_cat_loss(y_true, y_pred):\n",
    "    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tips for LSTM Inputs\n",
    "- The LSTM input layer must be 3D.\n",
    "- The 3 input dimensions are: samples, time steps, and features (sequences, sequence_length, characters).\n",
    "- The LSTM input layer is defined by the input_shape argument on the first hidden layer.\n",
    "- The input_shape argument takes a tuple of two values that define the number of time steps and features.\n",
    "- The number of samples is assumed to be 1 or more.\n",
    "- The reshape() function on NumPy arrays can be used to reshape your 1D or 2D data to be 3D.\n",
    "- The reshape() function takes a tuple as an argument that defines the new shape.\n",
    "- The LSTM returns the entire sequence of outputs for each sample (one vector per timestep per sample) if you set return_sequences=True.\n",
    "- A stateful RNN only makes sense if each input sequence in a batch starts exactly where the corresponding sequence in the previous batch left off. Our RNN model is stateless, since the samples are independent headlines rather than one continuous text corpus.\n",
    "\n",
    "#### Tips for Embedding Layer\n",
    "- Gives a relationship between characters.\n",
    "- Dense, n-dimensional vector representation of floating-point values; maps each char/byte to a dense vector.\n",
    "- Embeddings are trainable weights/parameters of the model, equivalent to the weights learned by a dense layer.\n",
    "- In our case each unique character/byte is represented with an N-dimensional vector of floating-point values, where the learned embedding forms a lookup table by \"looking up\" each character's dense vector in the table to encode it.\n",
    "- A simple integer encoding of our characters is not efficient for the model to interpret, since a linear classifier only learns a weight per feature but not the relationship (probability distribution) between the features (characters) or their encodings.\n",
    "- A higher-dimensional embedding can capture fine-grained relationships between characters, but takes more data to learn (256 dimensions in our case).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1360,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# stateful=True: preserve the hidden (final) state as the next batch's initial state\n",
    "def create_model(batch_size):\n",
    "    model = Sequential(name = 'CharLSTM')\n",
    "    model.add(Embedding(127, 256, batch_input_shape=[batch_size, None], mask_zero=True, name ='EmbedLayer'))\n",
    "    model.add(Bidirectional(LSTM(1024, return_sequences=True, stateful=False, recurrent_initializer='glorot_uniform'), name = 'BiLSTM'))\n",
    "    model.add(TimeDistributed(Dense(127, name = 'TimeDistDense')))\n",
    "    model.compile(optimizer=tf.optimizers.SGD(learning_rate=1e-3), loss = tf.losses.SparseCategoricalCrossentropy(from_logits = True))\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1361,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = create_model(batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1362,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"CharLSTM\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "EmbedLayer (Embedding)       (128, None, 256)          32512     \n",
      "_________________________________________________________________\n",
      "BiLSTM (Bidirectional)       (128, None, 2048)         10493952  \n",
      "_________________________________________________________________\n",
      "time_distributed_15 (TimeDis (None, None, 127)         260223    \n",
      "=================================================================\n",
      "Total params: 10,786,687\n",
      "Trainable params: 10,786,687\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1363,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(\n",
      "[ 60 115  62  75 101 115 115 108 101 114  32  84 111 112  97 122  32  77 … 0 0 0], shape=(412,), dtype=int32)  (input: '<s>Kessler Topaz Meltzer & Check, LLP Files Shareholder Class Action Lawsuit Against Baxter International Inc<\\s' plus zero padding; middle values trimmed) \n",
      "\n",
      "tf.Tensor(\n",
      "[115  62  75 101 115 115 108 101 114  32  84 111 112  97 122  32  77 101 … 0 0 0], shape=(412,), dtype=int32)  (target: same bytes shifted one position left)\n"
     ]
    }
   ],
   "source": [
    "for input_example_batch, target_example_batch in dataset.take(1):\n",
    "    print(input_example_batch[0], '\\n')\n",
    "    print(target_example_batch[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1371,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(128, 412, 127)  <=== (batch_size, sequence_length, byte/character)\n"
     ]
    }
   ],
   "source": [
    "for input_example_batch, target_example_batch in dataset.take(1):\n",
    "\n",
    "    # Predict off some random batch\n",
    "    example_batch_predictions = model(input_example_batch)\n",
    "\n",
    "    # Display the dimensions of the predictions\n",
    "    print(example_batch_predictions.shape, \" <=== (batch_size, sequence_length, byte/character)\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Logits Predicting Log-Likelihood from Output Layer:\n",
    "For each character/byte the model looks up the embedding, runs the LSTM one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character/byte.\n",
    "This distribution, for each predicted character/byte, is defined by the logits over the characters (i.e., byte values 1-127)."
   ]
  },
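  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A standard pre-training sanity check (a sketch, not an output from the original run): an untrained model should assign roughly uniform probability to the 127 output classes, so the mean sparse categorical crossentropy on one batch should sit near ln(127) ≈ 4.84."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-timestep losses on the example batch from above, shape (128, 412)\n",
    "example_batch_loss = sparse_cat_loss(target_example_batch, example_batch_predictions)\n",
    "print('Mean batch loss:  ', example_batch_loss.numpy().mean())\n",
    "print('Expected ~ln(127):', np.log(127))"
   ]
  },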
  {
   "cell_type": "code",
   "execution_count": 1374,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(412, 127), dtype=float32, numpy=… values trimmed …>"
      ]
     },
     "execution_count": 1374,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sum(np.where(example_batch_predictions[0][1] < 0, example_batch_predictions[0][1] * -1, example_batch_predictions[0][1]))\n",
    "example_batch_predictions[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1377,
   "metadata": {},
   "outputs": [],
   "source": [
    "sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1378,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tf.Tensor: shape=(412, 1), dtype=int64, numpy=… values trimmed …>"
      ]
     },
     "execution_count": 1378,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampled_indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1379,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reformat so it is not a list of lists\n",
    "sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1380,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\x12'"
      ]
     },
     "execution_count": 1380,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chr(18)"
   ]
  },
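  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Decoding the sampled indices back into characters (the printout is illustrative; before training it is just random bytes) shows why the untrained model's predictions look like noise, as in the output of the next cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sampled_indices holds one sampled byte id (1-127) per timestep\n",
    "print(''.join(map(chr, sampled_indices)))"
   ]
  },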
\u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_in_multi_worker_mode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# pylint: disable=protected-access\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 66\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 67\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 68\u001b[0m \u001b[1;31m# Running inside `run_distribute_coordinator` already.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m 846\u001b[0m batch_size=batch_size):\n\u001b[0;32m 847\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_train_batch_begin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 848\u001b[1;33m \u001b[0mtmp_logs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtrain_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 849\u001b[0m \u001b[1;31m# Catch OutOfRangeError for Datasets of unknown size.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 850\u001b[0m \u001b[1;31m# This blocks until the batch has finished executing.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m 578\u001b[0m \u001b[0mxla_context\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mExit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 579\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 580\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 581\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 582\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m 609\u001b[0m \u001b[1;31m# In this case we have created variables on the first call, so 
we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 610\u001b[0m \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 611\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 612\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 613\u001b[0m \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 2418\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2419\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_maybe_define_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2420\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_filtered_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# pylint: disable=protected-access\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2421\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2422\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_filtered_call\u001b[1;34m(self, args, kwargs)\u001b[0m\n\u001b[0;32m 1663\u001b[0m if isinstance(t, (ops.Tensor,\n\u001b[0;32m 1664\u001b[0m resource_variable_ops.BaseResourceVariable))),\n\u001b[1;32m-> 1665\u001b[1;33m self.captured_inputs)\n\u001b[0m\u001b[0;32m 1666\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1667\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_call_flat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcaptured_inputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcancellation_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m 1744\u001b[0m \u001b[1;31m# No tape is watching; skip to running the 
function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1745\u001b[0m return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1746\u001b[1;33m ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m 1747\u001b[0m forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m 1748\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m 596\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 597\u001b[0m \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 598\u001b[1;33m ctx=ctx)\n\u001b[0m\u001b[0;32m 599\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 600\u001b[0m outputs = execute.execute_with_cancellation(\n", + "\u001b[1;32mc:\\users\\firo obeid\\anaconda3\\envs\\pushstashenv\\lib\\site-packages\\tensorflow\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[0mctx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mensure_initialized\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 59\u001b[0m tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,\n\u001b[1;32m---> 60\u001b[1;33m inputs, attrs, num_outputs)\n\u001b[0m\u001b[0;32m 61\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# https://www.kdnuggets.com/2019/03/train-keras-model-20x-faster-tpu-free.html\n", + "epochs = 10\n", + "history = model.fit(dataset, epochs=epochs, use_multiprocessing = True, workers=5, verbose = 2, callbacks=[checkpoint_callback])" + ] + }, + { + "cell_type": "code", + "execution_count": 1385, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'./training_checkpoints\\\\ckpt_3'" + ] + }, + "execution_count": 1385, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tf.train.latest_checkpoint(checkpoint_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 1390, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1390, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = create_model(batch_size=1)\n", + "\n", + "model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))" + ] + }, + { + "cell_type": "code", + "execution_count": 1391, + "metadata": {}, + "outputs": [], + "source": [ + "model.build(tf.TensorShape([1, None]))" + ] + }, + { + "cell_type": "code", + "execution_count": 1392, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: 
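+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "The checkpoints store weights only, and weight shapes do not depend on the batch dimension, so the trained parameters load cleanly into a copy of the network rebuilt with batch_size=1 for single-headline inference. The summary below should match the training run exactly: Embedding 127 x 256 = 32,512 params; BiLSTM 2 x 4 x 1024 x (256 + 1024 + 1) = 10,493,952; TimeDistributed Dense 2048 x 127 + 127 = 260,223."
+  ]
+ },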
\"CharLSTM\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "EmbedLayer (Embedding) (1, None, 256) 32512 \n", + "_________________________________________________________________\n", + "BiLSTM (Bidirectional) (1, None, 2048) 10493952 \n", + "_________________________________________________________________\n", + "time_distributed_17 (TimeDis (None, None, 127) 260223 \n", + "=================================================================\n", + "Total params: 10,786,687\n", + "Trainable params: 10,786,687\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1406, + "metadata": {}, + "outputs": [], + "source": [ + "X_test, y_test = next(iter(test_seq_data))" + ] + }, + { + "cell_type": "code", + "execution_count": 1422, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "55" + ] + }, + "execution_count": 1422, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(\"\".join(map(chr,X_test.numpy())).strip(chr(0)))" + ] + }, + { + "cell_type": "code", + "execution_count": 1594, + "metadata": {}, + "outputs": [], + "source": [ + "new = X[np.random.randint(0,len(data))]" + ] + }, + { + "cell_type": "code", + "execution_count": 1595, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F5 Networks, Inc. Provides Earnings Guidance for the First Quarter of Fiscal Year 2020 Ending December 31, 2019<\\s\n", + "117\n" + ] + } + ], + "source": [ + "print(\"\".join(map(chr, new)).strip(chr(0)))\n", + "print(len(\"\".join(map(chr, new)).strip(chr(0))))" + ] + }, + { + "cell_type": "code", + "execution_count": 1588, + "metadata": {}, + "outputs": [], + "source": [ + "prediction = model(new.reshape(1,-1))" + ] + }, + { + "cell_type": "code", + "execution_count": 1589, + "metadata": {}, + "outputs": [], + "source": [ + "# prediction.numpy()[:,54:,:] \n", + "prediction = prediction[-1,:,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 1591, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s~P'PPsjjjs!!s!!\\!!!!!\\!!!!!!!!!!!f!f!!!\u0010ooff!!s!!!!!f\n", + "o!!\u001e", + "FffffQM:fIfff!eseHU\u0010\u0010\u0010ssss.\u0010bD!!!!fffff!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", + "97\n" + ] + } + ], + "source": [ + "print(\"\".join(map(chr,np.argmax(prediction, axis = 1) + 1)))\n", + "print(len(\"s~P'PPsjjjs!!s!!\\!!!!!\\!!!!!!!!!!!f!f!!!\u0010ooff!!s!!!!!fo!!\u001e", + "FffffQM:fIfff!eseHU\u0010\u0010\u0010ssss.\u0010bD!!!!fffff\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 1529, + "metadata": {}, + "outputs": [], + "source": [ + "p_i = np.zeros((prediction.shape))\n", + "for i in range(0, 412):\n", + " p = np.exp(prediction[i])/np.sum(np.exp(prediction[i]))\n", + " p_i[i] = p" + ] + }, + { + "cell_type": "code", + "execution_count": 1530, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([125, 125, 70, 70, 101, 91, 32, 32, 118, 32, 32, 32, 32,\n", + " 
+ {
+  "cell_type": "code",
+  "execution_count": 1530,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "array([125, 125,  70,  70, 101,  91,  32,  32, 118,  32,  32,  32,  32,\n",
+      "        32,  32,  32,  32,  32,  32, 114,  32,  32,  91, 101, 101,  32,\n",
+      "       114,  64, 101, 101, 101, 101,  15, 101,  15,  99, 110, 110,  97,\n",
+      "       ...,\n",
+      "        32,  32,  32,  32,  32,  32,  32,  32,  32], dtype=int64)\n",
+      "(truncated: every position past the headline predicts class 32)"
+     ]
+    },
+    "execution_count": 1530,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "np.argmax(p_i, axis=1)"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 1534,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "'~~GGf\\\\!!w!!!!... (truncated: noisy characters, then a long run of repeated \"!\")'"
+     ]
+    },
+    "execution_count": 1534,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "\"\".join(map(chr, np.argmax(p_i, axis=1) + 1))"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 1463,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "(412, 127)"
+     ]
+    },
+    "execution_count": 1463,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "prediction.shape"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 870,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "479/479 - 10507s - loss: 1.2081 - accuracy: 8.6585e-04\n"
+    ]
+   },
+   {
+    "data": {
+     "text/plain": [
+      "[1.2080851793289185, 0.0008658546721562743]"
+     ]
+    },
+    "execution_count": 870,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "model.evaluate(dataset, verbose=2, use_multiprocessing=True)  # was use_multiprocessing=1"
+  ]
+ },
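+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "The decoding convention used in the cells above is easy to get wrong: class index i corresponds to the byte value i + 1, because chr(0) is used only for padding (the earlier cells strip it) and is excluded from the 127 output classes. A small hypothetical helper, not part of the original pipeline, makes the shift explicit:"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "def decode_ids(ids):\n",
+   "    \"\"\"Map predicted class ids back to characters (class i <-> chr(i + 1)).\"\"\"\n",
+   "    return \"\".join(chr(int(i) + 1) for i in ids)\n",
+   "\n",
+   "decode_ids(np.argmax(p_i, axis=1))"
+  ]
+ },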
"execution_count": 948, + "metadata": {}, + "outputs": [], + "source": [ + "model = create_model(batch_size=128)\n", + "\n", + "model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))\n", + "\n", + "model.build(tf.TensorShape([1, None]))" + ] + }, + { + "cell_type": "code", + "execution_count": 1647, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_text(model, start_string):\n", + " # Evaluation step (generating text using the learned model)\n", + "\n", + " # Number of characters to generate\n", + " num_generate = 100\n", + "\n", + " # Converting our start string to numbers (vectorizing)\n", + " input_eval = [ord(s) for s in start_string]\n", + " input_eval = tf.expand_dims(input_eval, 0)\n", + "\n", + " # Empty string to store our results\n", + " text_generated = []\n", + "\n", + " # Low temperatures results in more predictable text.\n", + " # Higher temperatures results in more surprising text.\n", + " # Experiment to find the best setting.\n", + " temperature = 1.0\n", + "\n", + " # Here batch size == 1\n", + " model.reset_states()\n", + " for i in range(num_generate):\n", + " predictions = model(input_eval)\n", + " # remove the batch dimension\n", + " predictions = tf.squeeze(predictions, 0)\n", + "\n", + " # using a categorical distribution to predict the character returned by the model\n", + " predictions = predictions / temperature\n", + " predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()\n", + "\n", + " # We pass the predicted character as the next input to the model\n", + " # along with the previous hidden state\n", + " input_eval = tf.expand_dims([predicted_id], 0)\n", + "\n", + " text_generated.append(chr(predicted_id))\n", + "\n", + " return (start_string + ''.join(text_generated))" + ] + }, + { + "cell_type": "code", + "execution_count": 1648, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "u\\F;q7z'^7XZFB]B2mLpx~f\u0012\u001fa-'?\u0014\u000e(*Q+u^4K\u001d", + "MC#\u00046]]:\u0007a.LS\u000b", + "8jO\u0004\n", + "$[\u001bf*J\u00071\u0017B#\u001a\u0013\n", + "\u0017$ILq&\u0003c\n" + ] + } + ], + "source": [ + "print(generate_text(model, start_string=\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "how much for the maple syrup 20.99 Thats ricidulous\n" + ] + } + ], + "source": [ + "\n", + "\n", + "strs = \"how much for the maple syrup? $%20.99? 
+ {
+  "cell_type": "code",
+  "execution_count": 175,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "how much for the maple syrup 20.99 Thats ricidulous\n"
+    ]
+   }
+  ],
+  "source": [
+   "import re\n",
+   "\n",
+   "# strip punctuation and typographic symbols before feeding text to the model;\n",
+   "# note that #-( inside the character class is a range covering # $ % & ' (\n",
+   "strs = \"how much for the maple syrup? $%20.99? That's ”˚‑|–—=_ricidulous!!!???|||\"\n",
+   "\n",
+   "nstr = re.sub('[!,*)@=#-({|}_‑–?^;:{|}˚~\\t\\n“—’”/_]', r'', strs)\n",
+   "print(nstr)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Save Trained Vectors of Embeddings and Model Weights"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 1646,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "import io, csv\n",
+   "\n",
+   "# grab the trained character-embedding matrix from the first layer\n",
+   "e = model.layers[0]\n",
+   "weights = e.get_weights()[0]\n",
+   "weights.shape  # (characters/bytes, embedding_dim) --> (127, 256)\n",
+   "\n",
+   "# write the vectors and their character labels as TSVs (e.g. for the Embedding Projector)\n",
+   "out_v = io.open('vecs.tsv', 'w', encoding='utf-8')\n",
+   "out_m = io.open('meta.tsv', 'w', encoding='utf-8')\n",
+   "tsv_writer = csv.writer(out_m, delimiter='\\t')\n",
+   "\n",
+   "for i in range(0, 127):\n",
+   "    vec = weights[i]\n",
+   "    # row i is the character chr(i + 1); chr(0) is padding and gets no row\n",
+   "    tsv_writer.writerow([chr(i + 1)])  # was writerow(str(chr(i+1))), which splits multi-character fields\n",
+   "    out_v.write('\\t'.join([str(x) for x in vec]) + \"\\n\")\n",
+   "out_v.close()\n",
+   "out_m.close()"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Stock Direction Prediction"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "By default, the output of an RNN layer contains a single vector per sample. This vector is the RNN cell output corresponding to the last timestep, containing information about the entire input sequence. The shape of this output is (batch_size, units), where units corresponds to the units argument passed to the layer's constructor."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "def direction_model(batch_size):\n",
+   "    model = Sequential(name='RNNStocks')\n",
+   "    model.add(Embedding(input_dim=127, output_dim=256,\n",
+   "                        batch_input_shape=[batch_size, None],\n",
+   "                        weights=[embedding_matrix], trainable=False, name='EmbedLayer'))\n",
+   "    # return_sequences=False: the final state encodes a full representation of the passed headline\n",
+   "    model.add(Bidirectional(LSTM(1024, return_sequences=False, stateful=False, recurrent_initializer='glorot_uniform'), name='BiLSTM'))\n",
+   "    model.add(Dense(512, activation=tf.nn.leaky_relu, name='FullConnected'))  # 'LeakyReLU' is not a valid activation string\n",
+   "    model.add(Dense(1, name='Output', activation='sigmoid'))\n",
+   "    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy')\n",
+   "    return model"
+  ]
+ }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}