{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Show NumberAir DateRoundCategoryValueQuestionAnswer
046802004-12-31Jeopardy!HISTORY$200For the last 8 years of his life, Galileo was ...Copernicus
146802004-12-31Jeopardy!ESPN's TOP 10 ALL-TIME ATHLETES$200No. 2: 1912 Olympian; football star at Carlisl...Jim Thorpe
246802004-12-31Jeopardy!EVERYBODY TALKS ABOUT IT...$200The city of Yuma in this state has a record av...Arizona
346802004-12-31Jeopardy!THE COMPANY LINE$200In 1963, live on \"The Art Linkletter Show\", th...McDonald's
446802004-12-31Jeopardy!EPITAPHS & TRIBUTES$200Signer of the Dec. of Indep., framer of the Co...John Adams
........................
1999435822000-03-14Jeopardy!U.S. GEOGRAPHY$200Of 8, 12 or 18, the number of U.S. states that...18
1999535822000-03-14Jeopardy!POP MUSIC PAIRINGS$200...& the New Power GenerationPrince
1999635822000-03-14Jeopardy!HISTORIC PEOPLE$200In 1589 he was appointed professor of mathemat...Galileo
1999735822000-03-14Jeopardy!1998 QUOTATIONS$200Before the grand jury she said, \"I'm really so...Monica Lewinsky
1999835822000-03-14Jeopardy!LLAMA-RAMA$200Llamas are the heftiest South American members...Camels
\n", "

19999 rows × 7 columns

\n", "
" ], "text/plain": [ " Show Number Air Date Round Category \\\n", "0 4680 2004-12-31 Jeopardy! HISTORY \n", "1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES \n", "2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... \n", "3 4680 2004-12-31 Jeopardy! THE COMPANY LINE \n", "4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES \n", "... ... ... ... ... \n", "19994 3582 2000-03-14 Jeopardy! U.S. GEOGRAPHY \n", "19995 3582 2000-03-14 Jeopardy! POP MUSIC PAIRINGS \n", "19996 3582 2000-03-14 Jeopardy! HISTORIC PEOPLE \n", "19997 3582 2000-03-14 Jeopardy! 1998 QUOTATIONS \n", "19998 3582 2000-03-14 Jeopardy! LLAMA-RAMA \n", "\n", " Value Question \\\n", "0 $200 For the last 8 years of his life, Galileo was ... \n", "1 $200 No. 2: 1912 Olympian; football star at Carlisl... \n", "2 $200 The city of Yuma in this state has a record av... \n", "3 $200 In 1963, live on \"The Art Linkletter Show\", th... \n", "4 $200 Signer of the Dec. of Indep., framer of the Co... \n", "... ... ... \n", "19994 $200 Of 8, 12 or 18, the number of U.S. states that... \n", "19995 $200 ...& the New Power Generation \n", "19996 $200 In 1589 he was appointed professor of mathemat... \n", "19997 $200 Before the grand jury she said, \"I'm really so... \n", "19998 $200 Llamas are the heftiest South American members... \n", "\n", " Answer \n", "0 Copernicus \n", "1 Jim Thorpe \n", "2 Arizona \n", "3 McDonald's \n", "4 John Adams \n", "... ... 
\n", "19994 18 \n", "19995 Prince \n", "19996 Galileo \n", "19997 Monica Lewinsky \n", "19998 Camels \n", "\n", "[19999 rows x 7 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas\n", "import csv\n", "\n", "jeopardy = pandas.read_csv(\"jeopardy.csv\")\n", "\n", "jeopardy" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',\n", " ' Question', ' Answer'],\n", " dtype='object')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jeopardy.columns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "jeopardy.columns = ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def normalize_text(text):\n", " text = text.lower()\n", " text = re.sub(\"[^A-Za-z0-9\\s]\", \"\", text)\n", " text = re.sub(\"\\s+\", \" \", text)\n", " return text\n", "\n", "def normalize_values(text):\n", " text = re.sub(\"[^A-Za-z0-9\\s]\", \"\", text)\n", " try:\n", " text = int(text)\n", " except Exception:\n", " text = 0\n", " return text" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "jeopardy[\"clean_question\"] = jeopardy[\"Question\"].apply(normalize_text)\n", "jeopardy[\"clean_answer\"] = jeopardy[\"Answer\"].apply(normalize_text)\n", "jeopardy[\"clean_value\"] = jeopardy[\"Value\"].apply(normalize_values)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Show NumberAir DateRoundCategoryValueQuestionAnswerclean_questionclean_answerclean_value
046802004-12-31Jeopardy!HISTORY$200For the last 8 years of his life, Galileo was ...Copernicusfor the last 8 years of his life galileo was u...copernicus200
146802004-12-31Jeopardy!ESPN's TOP 10 ALL-TIME ATHLETES$200No. 2: 1912 Olympian; football star at Carlisl...Jim Thorpeno 2 1912 olympian football star at carlisle i...jim thorpe200
246802004-12-31Jeopardy!EVERYBODY TALKS ABOUT IT...$200The city of Yuma in this state has a record av...Arizonathe city of yuma in this state has a record av...arizona200
346802004-12-31Jeopardy!THE COMPANY LINE$200In 1963, live on \"The Art Linkletter Show\", th...McDonald'sin 1963 live on the art linkletter show this c...mcdonalds200
446802004-12-31Jeopardy!EPITAPHS & TRIBUTES$200Signer of the Dec. of Indep., framer of the Co...John Adamssigner of the dec of indep framer of the const...john adams200
.................................
1999435822000-03-14Jeopardy!U.S. GEOGRAPHY$200Of 8, 12 or 18, the number of U.S. states that...18of 8 12 or 18 the number of us states that tou...18200
1999535822000-03-14Jeopardy!POP MUSIC PAIRINGS$200...& the New Power GenerationPrincethe new power generationprince200
1999635822000-03-14Jeopardy!HISTORIC PEOPLE$200In 1589 he was appointed professor of mathemat...Galileoin 1589 he was appointed professor of mathemat...galileo200
1999735822000-03-14Jeopardy!1998 QUOTATIONS$200Before the grand jury she said, \"I'm really so...Monica Lewinskybefore the grand jury she said im really sorry...monica lewinsky200
1999835822000-03-14Jeopardy!LLAMA-RAMA$200Llamas are the heftiest South American members...Camelsllamas are the heftiest south american members...camels200
\n", "

19999 rows × 10 columns

\n", "
" ], "text/plain": [ " Show Number Air Date Round Category \\\n", "0 4680 2004-12-31 Jeopardy! HISTORY \n", "1 4680 2004-12-31 Jeopardy! ESPN's TOP 10 ALL-TIME ATHLETES \n", "2 4680 2004-12-31 Jeopardy! EVERYBODY TALKS ABOUT IT... \n", "3 4680 2004-12-31 Jeopardy! THE COMPANY LINE \n", "4 4680 2004-12-31 Jeopardy! EPITAPHS & TRIBUTES \n", "... ... ... ... ... \n", "19994 3582 2000-03-14 Jeopardy! U.S. GEOGRAPHY \n", "19995 3582 2000-03-14 Jeopardy! POP MUSIC PAIRINGS \n", "19996 3582 2000-03-14 Jeopardy! HISTORIC PEOPLE \n", "19997 3582 2000-03-14 Jeopardy! 1998 QUOTATIONS \n", "19998 3582 2000-03-14 Jeopardy! LLAMA-RAMA \n", "\n", " Value Question \\\n", "0 $200 For the last 8 years of his life, Galileo was ... \n", "1 $200 No. 2: 1912 Olympian; football star at Carlisl... \n", "2 $200 The city of Yuma in this state has a record av... \n", "3 $200 In 1963, live on \"The Art Linkletter Show\", th... \n", "4 $200 Signer of the Dec. of Indep., framer of the Co... \n", "... ... ... \n", "19994 $200 Of 8, 12 or 18, the number of U.S. states that... \n", "19995 $200 ...& the New Power Generation \n", "19996 $200 In 1589 he was appointed professor of mathemat... \n", "19997 $200 Before the grand jury she said, \"I'm really so... \n", "19998 $200 Llamas are the heftiest South American members... \n", "\n", " Answer clean_question \\\n", "0 Copernicus for the last 8 years of his life galileo was u... \n", "1 Jim Thorpe no 2 1912 olympian football star at carlisle i... \n", "2 Arizona the city of yuma in this state has a record av... \n", "3 McDonald's in 1963 live on the art linkletter show this c... \n", "4 John Adams signer of the dec of indep framer of the const... \n", "... ... ... \n", "19994 18 of 8 12 or 18 the number of us states that tou... \n", "19995 Prince the new power generation \n", "19996 Galileo in 1589 he was appointed professor of mathemat... \n", "19997 Monica Lewinsky before the grand jury she said im really sorry... 
\n", "19998 Camels llamas are the heftiest south american members... \n", "\n", " clean_answer clean_value \n", "0 copernicus 200 \n", "1 jim thorpe 200 \n", "2 arizona 200 \n", "3 mcdonalds 200 \n", "4 john adams 200 \n", "... ... ... \n", "19994 18 200 \n", "19995 prince 200 \n", "19996 galileo 200 \n", "19997 monica lewinsky 200 \n", "19998 camels 200 \n", "\n", "[19999 rows x 10 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jeopardy" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "jeopardy[\"Air Date\"] = pandas.to_datetime(jeopardy[\"Air Date\"])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Show Number int64\n", "Air Date datetime64[ns]\n", "Round object\n", "Category object\n", "Value object\n", "Question object\n", "Answer object\n", "clean_question object\n", "clean_answer object\n", "clean_value int64\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jeopardy.dtypes" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def count_matches(row):\n", " split_answer = row[\"clean_answer\"].split()\n", " split_question = row[\"clean_question\"].split()\n", " if \"the\" in split_answer:\n", " split_answer.remove(\"the\")\n", " if len(split_answer) == 0:\n", " return 0\n", " match_count = 0\n", " for item in split_answer:\n", " if item in split_question:\n", " match_count += 1\n", " return match_count / len(split_answer)\n", "\n", "jeopardy[\"answer_in_question\"] = jeopardy.apply(count_matches, axis=1)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.059001965249777744" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jeopardy[\"answer_in_question\"].mean()" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "## Recycled questions\n", "\n", "The answer only appears in the question about `6%` of the time. This isn't a huge number, and means that we probably can't just hope that hearing a question will enable us to figure out the answer. We'll probably have to study." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6876260592169776" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "question_overlap = []\n", "terms_used = set()\n", "\n", "jeopardy = jeopardy.sort_values(\"Air Date\")\n", "\n", "for i, row in jeopardy.iterrows():\n", " split_question = row[\"clean_question\"].split(\" \")\n", " split_question = [q for q in split_question if len(q) > 5]\n", " match_count = 0\n", " for word in split_question:\n", " if word in terms_used:\n", " match_count += 1\n", " for word in split_question:\n", " terms_used.add(word)\n", " if len(split_question) > 0:\n", " match_count /= len(split_question)\n", " question_overlap.append(match_count)\n", "jeopardy[\"question_overlap\"] = question_overlap\n", "\n", "jeopardy[\"question_overlap\"].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Low value vs high value questions\n", "\n", "There is about `70%` overlap between terms in new questions and terms in old questions. This only looks at a small set of questions, and it doesn't look at phrases, it looks at single terms. This makes it relatively insignificant, but it does mean that it's worth looking more into the recycling of questions." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def determine_value(row):\n", " value = 0\n", " if row[\"clean_value\"] > 800:\n", " value = 1\n", " return value\n", "\n", "jeopardy[\"high_value\"] = jeopardy.apply(determine_value, axis=1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def count_usage(term):\n", " low_count = 0\n", " high_count = 0\n", " for i, row in jeopardy.iterrows():\n", " if term in row[\"clean_question\"].split(\" \"):\n", " if row[\"high_value\"] == 1:\n", " high_count += 1\n", " else:\n", " low_count += 1\n", " return high_count, low_count" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 1),\n", " (1, 0),\n", " (0, 1),\n", " (1, 0),\n", " (1, 0),\n", " (0, 2),\n", " (3, 8),\n", " (1, 0),\n", " (0, 1),\n", " (0, 1)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from random import choice\n", "\n", "terms_used_list = list(terms_used)\n", "comparison_terms = [choice(terms_used_list) for _ in range(10)]\n", "\n", "observed_expected = []\n", "\n", "for term in comparison_terms:\n", " observed_expected.append(count_usage(term))\n", "\n", "observed_expected" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),\n", " Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),\n", " Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),\n", " Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),\n", " Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),\n", " Power_divergenceResult(statistic=0.803925692253768, pvalue=0.3699222378079571),\n", " Power_divergenceResult(statistic=0.01052283698924083, 
pvalue=0.9182956181393399),\n", " Power_divergenceResult(statistic=2.487792117195675, pvalue=0.11473257634454047),\n", " Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469),\n", " Power_divergenceResult(statistic=0.401962846126884, pvalue=0.5260772985705469)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.stats import chisquare\n", "import numpy as np\n", "\n", "high_value_count = jeopardy[jeopardy[\"high_value\"] == 1].shape[0]\n", "low_value_count = jeopardy[jeopardy[\"high_value\"] == 0].shape[0]\n", "\n", "chi_squared = []\n", "for obs in observed_expected:\n", " total = sum(obs)\n", " total_prop = total / jeopardy.shape[0]\n", " high_value_exp = total_prop * high_value_count\n", " low_value_exp = total_prop * low_value_count\n", " \n", " observed = np.array([obs[0], obs[1]])\n", " expected = np.array([high_value_exp, low_value_exp])\n", " chi_squared.append(chisquare(observed, expected))\n", "\n", "chi_squared" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chi-squared results\n", "\n", "None of the terms had a significant difference in usage between high value and low value rows. Additionally, nearly all of the frequencies were lower than `5`, so the chi-squared test isn't as valid. It would be better to run this test with only terms that have higher frequencies." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }