import pandas as pd
# Load the Jeopardy question archive, then preview the first rows and the
# column labels.
jeopardy = pd.read_csv("Data/JEOPARDY.csv")
print(jeopardy.head(), jeopardy.columns, sep="\n")
# This dataset contains 216,930 Jeopardy questions, answers, and other data
# spanning 22 years of air time. The questions were obtained by Reddit user
# trexmatt, who crawled www.j-archive.com.
# Remove the stray spaces from the column names.
# Stripping the whitespace from the headers that are actually present (rather
# than overwriting them positionally with a hard-coded list) means a column can
# never be silently mislabeled if the file's column order or count changes.
# NOTE(review): assumes the stray spaces are leading/trailing only, so the
# result is "Show Number", "Air Date", "Round", "Category", "Value",
# "Question", "Answer" -- confirm against the printed output below.
jeopardy.columns = jeopardy.columns.str.strip()
print(jeopardy.columns)
# Text normalization helper: strips punctuation and lowercases the text.
from string import punctuation
print(punctuation)


def cleanstr(string):
    """Return ``string`` lowercased with all punctuation characters removed.

    The characters removed are exactly those in ``string.punctuation``.
    """
    # A translation table that maps every punctuation character to "deleted"
    # removes them all in a single pass over the text.
    without_punctuation = string.translate(str.maketrans("", "", punctuation))
    return without_punctuation.lower()
# Store normalized copies of the question and answer text in new columns,
# leaving the raw "Question" and "Answer" columns as they were.
jeopardy["clean_question"] = jeopardy["Question"].map(cleanstr)
jeopardy["clean_answer"] = jeopardy["Answer"].map(cleanstr)
print(jeopardy.head())
# Convert a raw value cell (text such as "$2,000") to an integer.
def convert(string):
    """Convert a raw ``Value`` cell to an integer amount.

    Args:
        string: The raw cell. Text has its punctuation removed with
            ``cleanstr`` before parsing; any other kind of value (for
            example the NaN float pandas uses for a missing cell) is
            passed to ``int`` as-is.

    Returns:
        The parsed integer, or 0 when the cell cannot be read as a number.
    """
    # Only text can be cleaned. Calling cleanstr on NaN/None would raise
    # AttributeError before the try block is reached and abort the whole
    # column conversion, so non-text values skip the cleaning step.
    candidate = cleanstr(string) if isinstance(string, str) else string
    try:
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: non-numeric text or NaN;
        # OverflowError: infinity. All of these mean "no usable value".
        return 0
# Derive an integer amount for every row from the raw value column.
jeopardy["clean_value"] = jeopardy["Value"].map(convert)
# Replace the air-date column with its parsed datetime equivalent.
jeopardy["Air Date"] = pd.to_datetime(jeopardy["Air Date"])
print(jeopardy.head(), jeopardy["Air Date"].dtype, sep="\n")
# Measure how much of the answer is given away by the question.
def match(row):
    """Return the fraction of answer words that also occur in the question.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            ``"clean_answer"`` and ``"clean_question"`` strings.

    Returns:
        Matched answer words divided by the number of answer words, with
        every occurrence of the word "the" ignored. Returns 0 when the
        answer has no words left to compare.
    """
    # split() with no argument collapses runs of whitespace. Splitting on a
    # single space would turn the double spaces left behind by removed
    # punctuation ("tom & jerry" -> "tom  jerry") into empty-string "words"
    # that inflate the denominator and can even count as matches.
    answer_words = [w for w in row["clean_answer"].split() if w != "the"]
    if not answer_words:
        return 0
    question_words = set(row["clean_question"].split())
    hits = sum(1 for w in answer_words if w in question_words)
    return hits / len(answer_words)
# Score every row with match(), keep the scores in a new column, and report
# their average.
jeopardy["answer_in_question"] = jeopardy.apply(match, axis="columns")
print(jeopardy["answer_in_question"].mean(), jeopardy.head(), sep="\n")
# On average, a little under 6% of an answer's words can be heard in its
# question, which implies that studying would be necessary to score high in
# Jeopardy.
# Work on a copy ordered by air date so that "previously used" really means
# "used in a row that sorts earlier".
jeo = jeopardy.sort_values("Air Date")
print(jeo.head(20))
print(jeo.shape)

# terms_used collects every qualifying word seen so far; question_overlap gets
# one entry per row: the share of that row's qualifying words already seen.
terms_used = set()
question_overlap = []
for question_text in jeo["clean_question"]:
    # Only words of 6 to 14 characters take part in the comparison.
    long_words = [w for w in question_text.split(" ") if 5 < len(w) < 15]
    repeated = 0
    for token in long_words:
        if token in terms_used:
            repeated += 1
        else:
            terms_used.add(token)
    # A question without qualifying words keeps an overlap of 0.
    overlap = repeated / len(long_words) if long_words else repeated
    question_overlap.append(overlap)
jeo["question_overlap"] = question_overlap
print(jeo["question_overlap"].mean())
# The overlap rate appears to be very high, but that is only to be expected
# with roughly 200,000 rows of data: toward the later questions, many words are
# bound to have been used many times before, since we look only at individual
# words and pay no attention to their order or how they connect with each other.
def value(row):
    """Flag a question as high value.

    Returns 1 when the row's ``"clean_value"`` is greater than 800,
    otherwise 0.
    """
    return 1 if row["clean_value"] > 800 else 0
# Label every question as high value (1) or low value (0), then discard the
# raw text columns from the working frame.
jeo["high_value"] = jeo.apply(value, axis="columns")
print(jeo.head())
jeo.drop(columns=["Question", "Answer", "Value"], inplace=True)
print(jeo.head())
def valuecount(word):
    """Count the high- and low-value questions that contain ``word``.

    Scans the module-level ``jeo`` DataFrame. A question contains the word
    when the word is one of the space-separated tokens of its
    ``"clean_question"`` text.

    Returns:
        A ``(high_count, low_count)`` tuple, where "high" means the row's
        ``"high_value"`` equals 1 and "low" means anything else.
    """
    high_total = 0
    low_total = 0
    for question_text, flag in zip(jeo["clean_question"], jeo["high_value"]):
        # Rows that do not mention the word contribute to neither count.
        if word not in question_text.split(" "):
            continue
        if flag == 1:
            high_total += 1
        else:
            low_total += 1
    return high_total, low_total
# Take five of the collected terms and count, for each one, how many high- and
# low-value questions use it.
# NOTE(review): terms_used was built as a set, so which five terms come first
# after the list conversion is arbitrary.
terms_used = list(terms_used)
comparison_terms = terms_used[:5]
obs_exp = [valuecount(term) for term in comparison_terms]
print(obs_exp)
print('a')
print(comparison_terms)
import numpy as np
from scipy.stats import chisquare
# Number of high- and low-value questions in the whole frame.
high_value_count = int((jeo["high_value"] == 1).sum())
low_value_count = int((jeo["high_value"] == 0).sum())
print(high_value_count, low_value_count)

# For every term, compare its observed high/low counts with the counts expected
# if its occurrences were split in proportion to the overall high/low totals.
chi_squared = []
for high_obs, low_obs in obs_exp:
    total = high_obs + low_obs
    # Share of all questions that contain the term.
    total_prop = total / jeo.shape[0]
    print(total, total_prop)
    expected = [total_prop * high_value_count, total_prop * low_value_count]
    print(*expected)
    chi_squared.append(chisquare(np.array([high_obs, low_obs]), np.array(expected)))
# Bare expression: displays the results when run as the last line of a
# notebook cell or in a REPL.
chi_squared
# None of the p-values is below 0.05, so none of the results is statistically
# significant. For the five words we tested, there is no clear sign that word
# occurrence differs much between high- and low-value questions.