import pandas as pd
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
# the datasets used in this project come from the NYC Department of Education
data = {}
for f in data_files:
    d = pd.read_csv("/Users/33Phoebe/Documents/OneDrive/Data Scientist Path/Data Sets/NYC School Data/{0}".format(f))
    key_name = f.replace(".csv", "")
    data[key_name] = d
all_survey = pd.read_csv("/Users/33Phoebe/Documents/OneDrive/Data Scientist Path/Data Sets/NYC School Data/survey_all.txt", delimiter = "\t", encoding = "windows-1252")
d75_survey = pd.read_csv("/Users/33Phoebe/Documents/OneDrive/Data Scientist Path/Data Sets/NYC School Data/survey_d75.txt", delimiter = "\t", encoding = "windows-1252")
#combine the two survey dataframes by stacking them one after another
survey = pd.concat([all_survey, d75_survey], axis = 0)
all_survey.head()
#rename dbn to DBN and select relevant variables
survey["DBN"] = survey["dbn"]
survey = survey[["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11", "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]]
data["survey"] = survey
print(data["survey"].shape)
for key, value in data.items():
print(key)
print(value.head())
Data cleaning: make the DBN key consistent across datasets
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
#function to pad a leading 0 onto single-digit CSD values in class_size
def pad(num):
    numstr = str(num)
    if len(numstr) == 2:
        return numstr
    return numstr.zfill(2)  #pad a leading 0 onto a single-digit number
data["class_size"]["padCSD"] = data["class_size"]["CSD"].apply(pad)
#create DBN by concatenating the padded CSD and SCHOOL CODE columns
data["class_size"]["DBN"] = data["class_size"]["padCSD"] + data["class_size"]["SCHOOL CODE"]
data["class_size"].info()
#combine three scores into one
scores = ["SAT Math Avg. Score", "SAT Critical Reading Avg. Score", "SAT Writing Avg. Score"]
for score in scores:
    data["sat_results"][score] = pd.to_numeric(data["sat_results"][score], errors = "coerce")
data["sat_results"]["sat_score"] = data["sat_results"][scores[0]] + data["sat_results"][scores[1]] + data["sat_results"][scores[2]]
data["sat_results"].head()
#coordinate values are embedded in the "Location 1" column of "hs_directory"; extract them into stand-alone columns
import re
def strip(string):
    coordinates = re.findall(r"\(.+\)", string)
    lat = coordinates[0].split(",")[0].replace("(", "")
    return lat
#strip out the latitudes from the coordinates in the Location 1 col
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(strip)
data["hs_directory"].head()
#strip out the longitude and convert both columns to numeric values
def strip_lon(string):
    coordinates = re.findall(r"\(.+\)", string)
    lon = coordinates[0].split(",")[1].replace(")", "")
    return lon
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(strip_lon)
coor = ["lat", "lon"]
directory = data["hs_directory"]
for c in coor:
    directory[c] = pd.to_numeric(directory[c], errors = "coerce")
directory.head()
data["class_size"].head()
#variables "GRADE " and "PROGRAM TYPE" have many redundant or missing values that are not needed in this analysis
print(data["class_size"]["GRADE "].unique())
print(data["class_size"]["PROGRAM TYPE"].unique())
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size.head()
#condense the class_size data and make the DBN unique
import numpy
dbn_group = class_size.groupby("DBN", as_index = False)
class_size = dbn_group.agg(numpy.mean)
data["class_size"] = class_size
data["class_size"].head()
#condense demographics data by selecting the 2011-2012 school year (coded as 20112012)
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["demographics"].head()
#condense graduation data
grad = data["graduation"]
grad = grad[grad["Cohort"] == '2006']
grad = grad[grad["Demographic"] == "Total Cohort"]
data["graduation"] = grad
grad.head()
#work on the ap_2010 dataset
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
ap = data["ap_2010"]
for col in cols:
    ap[col] = pd.to_numeric(ap[col], errors = "coerce")
    print(ap[col].dtypes)
Combine datasets
#combine sat_results with ap_2010 and graduation by left join
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], how = 'left', on = "DBN")
combined = combined.merge(data["graduation"], how = 'left', on = "DBN")
combined.shape
#inner join the other datasets
dfs = [data["class_size"], data["demographics"], data["survey"], data["hs_directory"]]
for df in dfs:
    combined = combined.merge(df, how = "inner", on = "DBN")
print(combined.head())
combined.shape
Clean the combined data
#fill missing values with the column means first; anything still missing is then filled with 0
means = combined.mean()
combined = combined.fillna(means)
combined = combined.fillna(0)
combined.head()
#extract the first two characters of DBN (the school district)
def ext2(string):
    return string[:2]
combined["school_dist"] = combined["DBN"].apply(ext2)
combined["school_dist"].head()
#find correlations between variables and "sat_score"
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
#look at the high correlation between enrollment and sat_score with a scatterplot
import matplotlib.pyplot as plt
%matplotlib inline
combined.plot(x = "total_enrollment", y = "sat_score", kind = "scatter")
#the high correlation between enrollment and sat_score seems to be driven by the cluster of schools in the lower-left corner:
low_enrollment = combined[combined["total_enrollment"] < 1000]
low_enrollment = low_enrollment[low_enrollment["sat_score"] < 1000]
print(low_enrollment["School Name"])
Some research on the internet reveals that these schools target the children of new immigrants, whose unfamiliarity with the English language could be a major barrier preventing them from achieving higher SAT scores.
#this suggests that a higher percentage of English language learners might correlate with lower SAT scores; to look into it:
combined.plot(x = "ell_percent", y = "sat_score", kind = "scatter")
This scatterplot further supports the hypothesis that the language barrier faced by English Language Learners could make them lag behind on the tests.
#focus on NYC by setting longitude and latitude boundaries
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i')
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = combined["lon"].tolist()
latitudes = combined["lat"].tolist()
#zorder=2 draws the points on top of the map; latlon=True means longitude/latitude values (not projected x, y) are passed in
#cmap sets the color scheme; c=combined["ell_percent"] maps the colors onto that variable
m.scatter(x = longitudes, y = latitudes, s = 20, zorder = 2, latlon = True, c = combined["ell_percent"], cmap = "summer")
plt.show()
#group schools by district
group_dist = combined.groupby("school_dist", as_index = False)
districts = group_dist.agg(numpy.mean)
districts.head(10)
#scatter plot with school districts instead of individual schools
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i')
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts["lon"].tolist()
latitudes = districts["lat"].tolist()
m.scatter(longitudes, latitudes, s = 50, zorder = 2, latlon = True, c = districts["ell_percent"], cmap = "summer")
plt.show()
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_10",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
sortedcorr = correlations[survey_fields].sort_values(ascending = False)
sortedcorr.plot(kind = 'barh', figsize = (12, 6))
Findings
- It appears that the parent response rate is negatively related to SAT score, while the student response rate and students' positive responses are highly correlated with it. The teacher response rate is also positively correlated. (The exact values are printed below.)
- Safety and respect scores and communication quality scores both appear to contribute to a higher SAT score.
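For reference, the exact response-rate correlations behind these statements can be printed (a quick check using the correlations Series computed earlier):
print(correlations[["rr_p", "rr_s", "rr_t"]])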
combined.plot("saf_s_11", "sat_score", kind = 'scatter')
- Below a safety score of about 6.5, safety does not seem to correlate much with SAT score.
- From roughly 7 up to about 8, safety score shows a positive correlation with SAT score; beyond that, the effect disappears (see the binned check below).
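A rough way to verify this is to bin the student safety score and compare the mean SAT score per bin (a quick sketch; the cut points are eyeballed from the plot):
#bin saf_s_11 and compare the mean sat_score in each bin
saf_bins = pd.cut(combined["saf_s_11"], bins = [0, 6.5, 7, 8, 10])
print(combined.groupby(saf_bins)["sat_score"].mean())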
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i')
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
m.fillcontinents(color='white',lake_color='#85A6D9')
m.scatter(longitudes, latitudes, s = 50, zorder = 2, latlon = True, c = districts["saf_s_11"], cmap = 'summer')
- It appears the schools in lower Manhattan and northern Brooklyn report higher safety scores.
- Safety scores are lower in the mid-Brooklyn area.
race = ["white_per", "asian_per", "black_per", "hispanic_per"]
sorted_race = correlations[race].sort_values()
sorted_race.plot(kind = 'barh')
- As all of the correlations exceed 0.25 in absolute value, they appear fairly strong for all four groups. However, white and Asian percentages are positively correlated with SAT score, while black and Hispanic percentages show strong negative correlations.
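For reference, the numbers behind this chart, plus a check that every correlation exceeds 0.25 in absolute value:
print(sorted_race)
print((sorted_race.abs() > 0.25).all())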
combined.plot.scatter("hispanic_per", "sat_score")
Hispanic percentage is clearly negatively correlated with SAT score: once the Hispanic (Latino) percentage reaches roughly 25% or higher, no school scores above 1500.
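A quick check of that cutoff (the 25% threshold is eyeballed from the plot):
#highest SAT score among schools with more than 25% hispanic students
print(combined.loc[combined["hispanic_per"] > 25, "sat_score"].max())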
combined.head()
combined["SCHOOL NAME"][combined["hispanic_per"] > 95]
- Again, most of these schools are international schools for immigrants, which might explain the low scores.
print(combined["SCHOOL NAME"][(combined["hispanic_per"] < 10) & (combined["sat_score"] > 1800)])
print(combined["SCHOOL NAME"][combined["sat_score"] > 1800])
Most of these high schools are "specialized schools" for talented students that only admit applicants who pass an entrance exam, which explains the high SAT scores. Dropping the "hispanic percentage < 10" condition still returns 5 of the same schools. Perhaps Hispanic students tend to do worse on standardized exams, or they are often newer immigrants whose language difficulties keep them from being admitted to many high-quality schools.
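The overlap can be checked directly by intersecting the two lists printed above (a small sketch reusing the same filters):
high_sat = set(combined["SCHOOL NAME"][combined["sat_score"] > 1800])
low_hisp_high_sat = set(combined["SCHOOL NAME"][(combined["hispanic_per"] < 10) & (combined["sat_score"] > 1800)])
print(len(high_sat), len(low_hisp_high_sat), len(high_sat & low_hisp_high_sat))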
sex = ["male_per", "female_per"]
correlations[sex].plot(kind = 'barh')
Although not very strong, the male percentage has a negative correlation with SAT score, while the female percentage has a positive one.
combined.plot.scatter("female_per", "sat_score")
It is fairly interesting that both a very low and a very high female percentage are associated with low SAT scores: below about 30% or above about 80% female, no school scores higher than 1400. Between roughly 40% and 60%, however, an increasing female percentage seems to have a positive effect.
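The thresholds read off the scatter plot can be checked with a couple of filters (a sketch; the cut points are approximate):
extreme = (combined["female_per"] < 30) | (combined["female_per"] > 80)
print(combined.loc[extreme, "sat_score"].max())
print(combined.loc[combined["female_per"].between(40, 60), "sat_score"].mean())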
combined[(combined["female_per"] > 60) & (combined["sat_score"] > 1700)]
These schools boast strong AP test preparation, which might be a strong indication of why their students perform better on the SAT.
combined["ap_per"] = combined["AP Test Takers "]/combined["total_enrollment"]
combined.plot.scatter('ap_per', 'sat_score')
combined.corr()["ap_per"]["sat_score"]
The correlation between ap_per and sat_score is fairly low, with r < 0.05. However, the scatter plot suggests a positive trend between the two before AP participation reaches about 20%. On the other hand, once AP participation exceeds about 40%, concentrating on AP not only stops helping but appears to hurt the results: no school with more than 40% AP participation scores above 1200.
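A direct check of that last claim (remembering that ap_per is a fraction, so 40% corresponds to 0.4):
#highest SAT score among schools where more than 40% of students took an AP exam
print(combined.loc[combined["ap_per"] > 0.4, "sat_score"].max())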