$~$
This project consists of two parts. In the first part we build a random forest prediction model. Here we practice downloading data from a web page and saving it to the workspace. We also use pipelines, a tool that helps us automate the pre-processing of the data. In addition, we tune the hyperparameters of the model, which helps us improve its performance.
$~$ *In the second part of the project we turn to web scraping. We take a text from The New York Times and analyze it. In this part we practice the following: first, we use the BeautifulSoup library, one of the most widely used tools for web scraping. We also practice writing functions, which helps improve our programming skills, as well as text preprocessing. Finally, we take a first look at natural language processing, using libraries such as NLTK, spaCy and TextBlob*.
$~$
In addition, we will practice the use of lambda functions, among other things.
$~$
#importing libraries
from urllib.request import urlretrieve
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, roc_curve, plot_confusion_matrix, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,f1_score
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning
from sklearn.utils._testing import ignore_warnings
plt.style.use("ggplot")
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import re
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
$~$
$~$
Grab the data set you want and build a pipeline and a grid search using the Random Forest algorithm.
$~$
$~$
For this exercise, we are going to work with a dataset obtained from the UCI Machine Learning Repository consisting of votes cast by members of the US House of Representatives. Our goal will be to predict their party affiliation ('Democrat' or 'Republican') based on how they voted on certain key issues.
$~$
The dataset is composed of the target variable party plus 16 binary (y/n) vote variables: infants, water, budget, physician, salvador, religious, satellite, aid, missile, immigration, synfuels, education, superfund, crime, duty_free_exports and eaa_rsa.
$~$
$~$
$~$
We are going to download the dataset directly from the UCI Machine Learning Repository website. For this we will use the urlretrieve() function from urllib.
$~$
## Specify the url: url
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"
# Save file locally
urlretrieve(url, "data_voting.csv")
# Read file into a DataFrame (note: the raw file has no header row, so without header=None the first record is consumed as column names, leaving 434 of the 435 rows)
df = pd.read_csv('data_voting.csv', sep=',')
# assign names to columns:
name_features= ['party', 'infants', 'water', 'budget', 'physician', 'salvador',
'religious', 'satellite', 'aid', 'missile', 'immigration', 'synfuels',
'education', 'superfund', 'crime', 'duty_free_exports', 'eaa_rsa']
df.columns = name_features
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ------
 0   party              434 non-null    object
 1   infants            434 non-null    object
 2   water              434 non-null    object
 3   budget             434 non-null    object
 4   physician          434 non-null    object
 5   salvador           434 non-null    object
 6   religious          434 non-null    object
 7   satellite          434 non-null    object
 8   aid                434 non-null    object
 9   missile            434 non-null    object
 10  immigration        434 non-null    object
 11  synfuels           434 non-null    object
 12  education          434 non-null    object
 13  superfund          434 non-null    object
 14  crime              434 non-null    object
 15  duty_free_exports  434 non-null    object
 16  eaa_rsa            434 non-null    object
dtypes: object(17)
memory usage: 57.8+ KB
df.head()
party | infants | water | budget | physician | salvador | religious | satellite | aid | missile | immigration | synfuels | education | superfund | crime | duty_free_exports | eaa_rsa | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | republican | n | y | n | y | y | y | n | n | n | n | n | y | y | y | n | ? |
1 | democrat | ? | y | y | ? | y | y | n | n | n | n | y | n | y | y | n | n |
2 | democrat | n | y | y | n | ? | y | n | n | n | n | y | n | y | n | n | y |
3 | democrat | y | y | y | n | y | y | n | n | n | n | y | ? | y | y | y | y |
4 | democrat | n | y | y | n | y | y | n | n | n | n | n | n | y | y | y | y |
df.party.value_counts(normalize=True).round(2)
democrat      0.62
republican    0.38
Name: party, dtype: float64
$~$
We observe that the classes are unbalanced: we have more data from the Democratic Party. We also see that the values in our dataset are strings, that the variables are binary, and that the missing values are marked with a question mark (?).
$~$
$~$
$~$
$~$
We are then going to implement a pipeline with two steps:
In the first step we impute the most frequent value into the missing values. We do this because our dataset is small and we cannot afford to delete data; and since the votes are categorical ("y"/"n"), the most frequent value is a more natural choice than an average.
In the second step we apply one-hot encoding to our variables, which, as we have already seen, are binary.
$~$
Once we have all this done, we can proceed to fit the random forest classifier model.
$~$
#change type to categorical
for x in df.columns:
df[x] = df[x].astype("category")
$~$
$~$
# how many '?' values do we have?
df[df == '?'].count()
party                  0
infants               12
water                 48
budget                11
physician             11
salvador              15
religious             11
satellite             14
aid                   15
missile               22
immigration            7
synfuels              20
education             31
superfund             25
crime                 17
duty_free_exports     28
eaa_rsa              104
dtype: int64
$~$
$~$
df[df == "?"] = np.nan
df.head()
party | infants | water | budget | physician | salvador | religious | satellite | aid | missile | immigration | synfuels | education | superfund | crime | duty_free_exports | eaa_rsa | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | republican | n | y | n | y | y | y | n | n | n | n | n | y | y | y | n | NaN |
1 | democrat | NaN | y | y | NaN | y | y | n | n | n | n | y | n | y | y | n | n |
2 | democrat | n | y | y | n | NaN | y | n | n | n | n | y | n | y | n | n | y |
3 | democrat | y | y | y | n | y | y | n | n | n | n | y | NaN | y | y | y | y |
4 | democrat | n | y | y | n | y | y | n | n | n | n | n | n | y | y | y | y |
$~$
$~$
#step 1:
# Setup the Imputation transformer: imp
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
#step 2:
# set up a one-hot encoder to transform the data into binary values
encoder = OneHotEncoder(drop="if_binary")
# Set up the pipeline with the required steps: steps
steps = [('imputation', imp),
('encoder', encoder)]
#create a pipeline
pipeline = Pipeline(steps)
#keep the transformed values as a dense array
data = pipeline.fit_transform(df).toarray()
#convert the array back into a DataFrame
data = pd.DataFrame(data,columns=name_features)
data
party | infants | water | budget | physician | salvador | religious | satellite | aid | missile | immigration | synfuels | education | superfund | crime | duty_free_exports | eaa_rsa | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
1 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
3 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
429 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
430 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
431 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
432 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 |
433 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 |
434 rows × 17 columns
$~$
$~$
$~$
As we said before, we are going to fit a random forest classifier model following these steps:
$~$
define the target variable and the explanatory variables (y, X) $~$
split the data into train and test sets (X_train, X_test, y_train, y_test); since we know that we have unbalanced classes, we will use the "stratify" parameter. $~$
fit a first model and look at its metrics (this will help us later to compare these results with the second model that we are going to fit). $~$
plot the confusion matrix and the ROC curve. $~$
perform hyperparameter tuning on this first model to improve its results; for this we are going to use the grid search method. $~$
show the best hyperparameters and the metrics obtained by the grid search.
$~$
y =data["party"].values
X = data.drop(["party"],axis=1).values
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, shuffle=True,stratify=y)
#create model rfc1
rfc1 = RandomForestClassifier(random_state=42)
rfc1.fit(X_train,y_train)
RandomForestClassifier(random_state=42)
# Predict the labels of the test set: y_pred1
y_pred1 = rfc1.predict(X_test)
# Compute and print metrics
print("Accuracy: {}".format(rfc1.score(X_test, y_test)))
print(classification_report(y_test, y_pred1))
Accuracy: 0.9694656488549618
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98        81
         1.0       0.98      0.94      0.96        50

    accuracy                           0.97       131
   macro avg       0.97      0.96      0.97       131
weighted avg       0.97      0.97      0.97       131
report1 = classification_report(y_test, y_pred1 ,output_dict=True)
report1 = pd.DataFrame(report1).transpose()
#keep confusion matrix
cf_matrix1 = confusion_matrix(y_test, y_pred1)
#keep accuracy
acu1 = accuracy_score(y_test, y_pred1)
#keep AUC
auc1 = roc_auc_score(y_test, y_pred1)
f1_1 = f1_score(y_test, y_pred1, average="macro" )
report1
precision | recall | f1-score | support | |
---|---|---|---|---|
0.0 | 0.963855 | 0.987654 | 0.975610 | 81.000000 |
1.0 | 0.979167 | 0.940000 | 0.959184 | 50.000000 |
accuracy | 0.969466 | 0.969466 | 0.969466 | 0.969466 |
macro avg | 0.971511 | 0.963827 | 0.967397 | 131.000000 |
weighted avg | 0.969699 | 0.969466 | 0.969340 | 131.000000 |
print("f1 macro score: ",f1_1)
f1 macro score: 0.9673967147834743
sns.set_context("talk")
plt.style.use('ggplot')
fig, axes = plt.subplots(1,2,figsize=(10,5),alpha=0.5)
group_names = ['True Neg','False Pos','False Neg','True Pos']
######## plot confusion matrix
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix1.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix1.flatten()/np.sum(cf_matrix1)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
ax = sns.heatmap(cf_matrix1, annot=labels, fmt='', cmap='Oranges',ax=axes[0])
axes[0].set_title('Confusion matrix (RandomForest mod) \n\n' + "accuracy score: {0:.2%}".format(acu1));
axes[0].set_xlabel('\nPredicted Values')
axes[0].set_ylabel('Actual Values ');
## Tick labels - list must be in alphabetical order
axes[0].xaxis.set_ticklabels(['False','True'])
axes[0].yaxis.set_ticklabels(['False','True'])
#plot AUC curve
y_pred_prob = rfc1.predict_proba(X_test)[:,1]
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_prob)
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].plot(fpr1, tpr1, label='Random forest classifier')
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate')
axes[1].set_title('RandomForest model \n\n'+ "AUC : {0:.2%}".format(auc1))
plt.tight_layout()
plt.show()
$~$
Our first model already gives good results; however, let's see if we can do better.
$~$
$~$
$~$
rfc = RandomForestClassifier(random_state=42)
# Specify the hyperparameter grid
parameters = {
'n_estimators': [50,100, 200, 400, 600],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth' : [2, 4, 6, 8, 10],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [12,20,40],
'criterion' :['gini', 'entropy'],
'bootstrap': [True, False]
}
# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(rfc,param_grid=parameters,cv=5,scoring="f1_macro")
# Fit to the training set
cv.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), param_grid={'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_depth': [2, 4, 6, 8, 10], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [12, 20, 40], 'n_estimators': [50, 100, 200, 400, 600]}, scoring='f1_macro')
# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)
# Compute and print metrics (note: with scoring="f1_macro", cv.score returns the macro F1 score, not accuracy)
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))
Accuracy: 0.9756460308607547
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98        81
         1.0       0.98      0.96      0.97        50

    accuracy                           0.98       131
   macro avg       0.98      0.97      0.98       131
weighted avg       0.98      0.98      0.98       131

Tuned Model Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 50}
#keep score
report = classification_report(y_test, y_pred ,output_dict=True)
report = pd.DataFrame(report).transpose()
#keep confusion matrix
cf_matrix = confusion_matrix(y_test, y_pred)
#keep accuracy
acu = accuracy_score(y_test, y_pred)
#keep AUC
auc = roc_auc_score(y_test, y_pred)
f1_macro= f1_score(y_test, y_pred, average="macro" )
report
precision | recall | f1-score | support | |
---|---|---|---|---|
0.0 | 0.975610 | 0.987654 | 0.981595 | 81.000000 |
1.0 | 0.979592 | 0.960000 | 0.969697 | 50.000000 |
accuracy | 0.977099 | 0.977099 | 0.977099 | 0.977099 |
macro avg | 0.977601 | 0.973827 | 0.975646 | 131.000000 |
weighted avg | 0.977130 | 0.977099 | 0.977054 | 131.000000 |
print("f1 macro score before gridsearch: ",f1_1)
print("f1 macro score after gridsearch: ",f1_macro)
f1 macro score before gridsearch: 0.9673967147834743
f1 macro score after gridsearch: 0.9756460308607547
sns.set_context("talk")
plt.style.use('ggplot')
fig, axes = plt.subplots(2,2,figsize=(12,10),alpha=0.5)
group_names = ['True Neg','False Pos','False Neg','True Pos']
######## plot confusion matrices
#mod
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix1.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix1.flatten()/np.sum(cf_matrix1)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
ax = sns.heatmap(cf_matrix1, annot=labels, fmt='', cmap='Oranges',ax=axes[0,0])
axes[0,0].set_title('Confusion matrix (RandomForest mod) \n\n' + "accuracy score: {0:.2%}".format(acu1));
axes[0,0].set_xlabel('\nPredicted Values')
axes[0,0].set_ylabel('Actual Values ');
## Tick labels - list must be in alphabetical order
axes[0,0].xaxis.set_ticklabels(['False','True'])
axes[0,0].yaxis.set_ticklabels(['False','True'])
#mod after hyperparameter tuning
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Greens',ax=axes[1,0])
axes[1,0].set_title('Confusion matrix (RandomForest mod after hyperparameter tuning) \n\n' + "accuracy score: {0:.2%}".format(acu));
axes[1,0].set_xlabel('\nPredicted Values')
axes[1,0].set_ylabel('Actual Values ');
## Tick labels - list must be in alphabetical order
axes[1,0].xaxis.set_ticklabels(['False','True'])
axes[1,0].yaxis.set_ticklabels(['False','True'])
#plot AUC curve
#mod 1
y_pred_prob = rfc1.predict_proba(X_test)[:,1]
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_prob)
axes[0,1].plot([0, 1], [0, 1], 'k--')
axes[0,1].plot(fpr1, tpr1, label='Random forest classifier')
axes[0,1].set_xlabel('False Positive Rate')
axes[0,1].set_ylabel('True Positive Rate')
axes[0,1].set_title('RandomForest model \n\n'+ "AUC : {0:.2%}".format(auc1))
#mod after hyperparameter tuning
y_pred_prob = cv.predict_proba(X_test)[:,1]
fpr1, tpr1, thresholds1 = roc_curve(y_test, y_pred_prob)
axes[1,1].plot([0, 1], [0, 1], 'k--')
axes[1,1].plot(fpr1, tpr1, label='RandomForest classifier')
axes[1,1].set_xlabel('False Positive Rate')
axes[1,1].set_ylabel('True Positive Rate')
axes[1,1].set_title('RandomForest mod after hyperparameter tuning \n\n'+ "AUC : {0:.2%}".format(auc))
plt.tight_layout()
plt.show()
$~$
As can be seen in the plots above, hyperparameter tuning helps us improve the performance of the model.
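As a complement, it can be useful to inspect what the grid search actually evaluated. A minimal sketch, assuming the fitted cv object from above (best_score_, best_estimator_ and cv_results_ are standard GridSearchCV attributes):
#keep the refit best estimator (refit=True is the GridSearchCV default)
best_rf = cv.best_estimator_
print("best cross-validated f1_macro: ", cv.best_score_)
#turn the full search log into a dataframe and show the top-ranked candidates
results = pd.DataFrame(cv.cv_results_)
cols = ["rank_test_score", "mean_test_score", "std_test_score", "params"]
results[cols].sort_values("rank_test_score").head()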
$~$
$~$
$~$
Take any text in English that you want, and calculate the frequency of the words.
$~$
$~$
For this exercise, we are first going to scrape the website of The New York Times. We will take the editorial from May 16, 2022 and analyze it.
$~$
To get the text we are going to use the BeautifulSoup library.
$~$
#define the url
url_text = "https://www.nytimes.com/2022/05/16/opinion/buffalo-shooting-replacement-theory.html"
s = requests.Session()
#set a pre-existing Cloudflare clearance cookie and user-agent to get past the site's bot filters
s.cookies["cf_clearance"] = "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
s.headers.update({
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
})
response = s.get(url_text)
# Extract the response as html: html_doc
html_doc = response.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)
# Get the title of article: article_title
article_title = soup.title
# Get article text: article_text
article_text = soup.get_text()
# Print article title to the shell
article_title.get_text()
'Opinion | The Buffalo Shooting Was Not a Random Act of Violence - The New York Times'
# Print article text to the shell
print(article_text)
Opinion | The Buffalo Shooting Was Not a Random Act of Violence - The New York Times SectionsSEARCHSkip to contentSkip to site indexLog inToday’s PaperOpinion|The Buffalo Shooting Was Not a Random Act of Violencehttps://www.nytimes.com/2022/05/16/opinion/buffalo-shooting-replacement-theory.htmlGive this article1219AdvertisementContinue reading the main storyOpinionSupported byContinue reading the main storyThe Editorial BoardThe Buffalo Shooting Was Not a Random Act of ViolenceMay 16, 2022Credit...Illustration by Rebecca Chew/The New York Times; photograph by NK08gerd, via Getty ImagesSend any friend a storyAs a subscriber, you have 10 gift articles to give each month. Anyone can read what you share.Give this article1219Read in appBy The Editorial BoardThe editorial board is a group of opinion journalists whose views are informed by expertise, research, debate and certain longstanding values. It is separate from the newsroom.Republican politicians, including some of the party’s top leaders, openly espouse versions of a white supremacist conspiracy theory holding that an orchestrated effort is underway to displace white Americans. A recently published poll found that almost half of Republicans believe that immigrants are being brought to the United States as part of such an effort.On Saturday, a gunman who said he was motivated by a version of this “replacement theory” killed 10 people at a Buffalo grocery store, officials said. The suspect, identified as Payton S. Gendron, wrote in an online diatribe that he sought to kill Black people because he wanted to prevent white people from losing their rightful control of the country.Mr. Gendron described himself as part of a movement. He said that he was inspired by similar attacks on other minority communities and that he hoped others would follow his example. The suspects in several mass killings in recent years, including the 2015 murder of nine Black worshipers at a church in Charleston, S.C.; the 2018 murder of 11 Jewish worshipers at a synagogue in Pittsburgh; the 2019 murder of 51 Muslim worshipers at a pair of mosques in New Zealand; and the 2019 murder of 23 people, many Latino, in El Paso also propounded versions of this racist worldview.American life is punctuated by mass shootings that are routinely described as idiosyncratic. But these attacks are not random acts; they are part of the long American history of political violence perpetrated by white supremacists against Black people and other minority groups.Politicians who have employed some of the vocabulary of replacement theory generally do not make explicit calls for violence. The office of one of those politicians, Representative Elise Stefanik of New York, said in a statement that the Buffalo attack was an “act of evil” and that she “has never advocated for any racist position.”The matter is not so simple.Replacement theory is an attack on democracy. It privileges the purported interests of some Americans over those of others, asserting, in effect, that the will of the people means the will of white people. It rekindles fears and resentments among white Americans that cynical practitioners of American politics have stoked throughout the nation’s history. 
It also provides a disturbing rationalization for people inclined to resort to violence when the political process does not deliver what they want or protect what they see as their place in society.The Fox News host Tucker Carlson, a leading purveyor of replacement theory rhetoric, has promoted the idea that elites are seeking to replace white Americans on more than 400 episodes of his program, according to an analysis by The New York Times.“Now I know that the left and all the little gatekeepers on Twitter become literally hysterical if you use the term ‘replacement,’ if you suggest the Democratic Party is trying to replace the current electorate, the voters now casting ballots, with new people, more obedient voters from the third world,” Mr. Carlson said on an episode in April 2021. “But they become hysterical because that’s — that’s what’s happening, actually.” Representative Matt Gaetz, a Florida Republican, later tweeted that Mr. Carlson “is CORRECT about Replacement Theory as he explains what is happening to America.”In September, Ms. Stefanik’s re-election campaign paid for a Facebook ad that combined imagery of immigrants with the accusation that “Radical Democrats are planning their most aggressive move yet: a PERMANENT ELECTION INSURRECTION.” Ms. Stefanik’s ad continued, “Their plan to grant amnesty to 11 MILLION illegal immigrants will overthrow our current electorate and create a permanent liberal majority in Washington.”Right-wing rhetoricians in the United States portray undocumented immigrants as the primary threat. This sanitizes replacement theory for mainstream consumption without diluting its logic. The same argument is easily applied to other minority groups.The French author Renaud Camus coined the term “the great replacement” in a 2011 book to describe what he saw as a conscious effort by French elites to open the country’s doors for Muslim immigrants to replace the ethnically French population and culture.The template has been adapted for use by extremists around the world. Mr. Gendron wrote that he blamed Jews for orchestrating the replacement of white Americans. He copied large portions of his manifesto from the document posted to justify the New Zealand killings, in some cases inserting the name of the Jewish philanthropist George Soros in place of the former German chancellor Angela Merkel’s name. The manifesto posted by the El Paso shooting suspect, which Mr. Gendron also referenced, spoke of the “Hispanic invasion of Texas.” The common thread — the ineluctable core of replacement theory — is that some people are white and some people are not, and the people who are white are threatened by those who are not.It must also be emphasized that the United States makes it easy for domestic terrorists to kill. The police said that the Buffalo assailant used a Bushmaster XM-15 rifle that he had purchased legally at a gun shop near his hometown. As a practical matter, almost anyone can buy guns that are designed to kill a lot of people quickly. The only real line of defense is the judgment of the people who sell guns. “He didn’t stand out — because if he did, I would’ve never sold him the gun,” Robert Donald, the store’s owner, told The New York Times.The focus on the gunman’s motives should not obscure the fact that the most important step the government can take to impede similar attacks is to limit the availability of guns.The health of American democracy also requires the constructive use of free speech, especially by the nation’s political leaders. 
There are always demagogues whose stock in trade is the demonization of immigrants and other minority groups, and American society has long allowed those on the fringes to air their views. The question in any era is whether such views are voiced, or echoed, by those in positions of responsibility.It is telling that House Republicans last year installed Ms. Stefanik in leadership to replace Representative Liz Cheney of Wyoming, who remains among the most forthright critics of the party’s illiberal turn.Ms. Cheney tweeted on Monday: “The House GOP leadership has enabled white nationalism, white supremacy, and anti-semitism. History has taught us that what begins with words ends in far worse. @GOP leaders must renounce and reject these views and those who hold them.”She’s right.The Times is committed to publishing a diversity of letters to the editor. We’d like to hear what you think about this or any of our articles. Here are some tips. And here’s our email: letters@nytimes.com.Follow The New York Times Opinion section on Facebook, Twitter (@NYTopinion) and Instagram.AdvertisementContinue reading the main storySite Information Navigation© 2022 The New York Times CompanyNYTCoContact UsAccessibilityWork with usAdvertiseT Brand StudioYour Ad ChoicesPrivacy PolicyTerms of ServiceTerms of SaleSite MapCanadaInternationalHelpSubscriptions
$~$
As you can see, we have obtained the text, but we still need to do a bit of cleaning. Let's clean up the text:
$~$
#find the start of text
start = article_text.find("Republican politicians, including some of the party’s top")
#find the end of the text
end = article_text.find("@GOP leaders must renounce and reject these views and those who hold them")
# get a clean text
article_text = article_text[start:end]
# print the text
print(article_text)
Republican politicians, including some of the party’s top leaders, openly espouse versions of a white supremacist conspiracy theory holding that an orchestrated effort is underway to displace white Americans. A recently published poll found that almost half of Republicans believe that immigrants are being brought to the United States as part of such an effort.On Saturday, a gunman who said he was motivated by a version of this “replacement theory” killed 10 people at a Buffalo grocery store, officials said. The suspect, identified as Payton S. Gendron, wrote in an online diatribe that he sought to kill Black people because he wanted to prevent white people from losing their rightful control of the country.Mr. Gendron described himself as part of a movement. He said that he was inspired by similar attacks on other minority communities and that he hoped others would follow his example. The suspects in several mass killings in recent years, including the 2015 murder of nine Black worshipers at a church in Charleston, S.C.; the 2018 murder of 11 Jewish worshipers at a synagogue in Pittsburgh; the 2019 murder of 51 Muslim worshipers at a pair of mosques in New Zealand; and the 2019 murder of 23 people, many Latino, in El Paso also propounded versions of this racist worldview.American life is punctuated by mass shootings that are routinely described as idiosyncratic. But these attacks are not random acts; they are part of the long American history of political violence perpetrated by white supremacists against Black people and other minority groups.Politicians who have employed some of the vocabulary of replacement theory generally do not make explicit calls for violence. The office of one of those politicians, Representative Elise Stefanik of New York, said in a statement that the Buffalo attack was an “act of evil” and that she “has never advocated for any racist position.”The matter is not so simple.Replacement theory is an attack on democracy. It privileges the purported interests of some Americans over those of others, asserting, in effect, that the will of the people means the will of white people. It rekindles fears and resentments among white Americans that cynical practitioners of American politics have stoked throughout the nation’s history. It also provides a disturbing rationalization for people inclined to resort to violence when the political process does not deliver what they want or protect what they see as their place in society.The Fox News host Tucker Carlson, a leading purveyor of replacement theory rhetoric, has promoted the idea that elites are seeking to replace white Americans on more than 400 episodes of his program, according to an analysis by The New York Times.“Now I know that the left and all the little gatekeepers on Twitter become literally hysterical if you use the term ‘replacement,’ if you suggest the Democratic Party is trying to replace the current electorate, the voters now casting ballots, with new people, more obedient voters from the third world,” Mr. Carlson said on an episode in April 2021. “But they become hysterical because that’s — that’s what’s happening, actually.” Representative Matt Gaetz, a Florida Republican, later tweeted that Mr. Carlson “is CORRECT about Replacement Theory as he explains what is happening to America.”In September, Ms. 
Stefanik’s re-election campaign paid for a Facebook ad that combined imagery of immigrants with the accusation that “Radical Democrats are planning their most aggressive move yet: a PERMANENT ELECTION INSURRECTION.” Ms. Stefanik’s ad continued, “Their plan to grant amnesty to 11 MILLION illegal immigrants will overthrow our current electorate and create a permanent liberal majority in Washington.”Right-wing rhetoricians in the United States portray undocumented immigrants as the primary threat. This sanitizes replacement theory for mainstream consumption without diluting its logic. The same argument is easily applied to other minority groups.The French author Renaud Camus coined the term “the great replacement” in a 2011 book to describe what he saw as a conscious effort by French elites to open the country’s doors for Muslim immigrants to replace the ethnically French population and culture.The template has been adapted for use by extremists around the world. Mr. Gendron wrote that he blamed Jews for orchestrating the replacement of white Americans. He copied large portions of his manifesto from the document posted to justify the New Zealand killings, in some cases inserting the name of the Jewish philanthropist George Soros in place of the former German chancellor Angela Merkel’s name. The manifesto posted by the El Paso shooting suspect, which Mr. Gendron also referenced, spoke of the “Hispanic invasion of Texas.” The common thread — the ineluctable core of replacement theory — is that some people are white and some people are not, and the people who are white are threatened by those who are not.It must also be emphasized that the United States makes it easy for domestic terrorists to kill. The police said that the Buffalo assailant used a Bushmaster XM-15 rifle that he had purchased legally at a gun shop near his hometown. As a practical matter, almost anyone can buy guns that are designed to kill a lot of people quickly. The only real line of defense is the judgment of the people who sell guns. “He didn’t stand out — because if he did, I would’ve never sold him the gun,” Robert Donald, the store’s owner, told The New York Times.The focus on the gunman’s motives should not obscure the fact that the most important step the government can take to impede similar attacks is to limit the availability of guns.The health of American democracy also requires the constructive use of free speech, especially by the nation’s political leaders. There are always demagogues whose stock in trade is the demonization of immigrants and other minority groups, and American society has long allowed those on the fringes to air their views. The question in any era is whether such views are voiced, or echoed, by those in positions of responsibility.It is telling that House Republicans last year installed Ms. Stefanik in leadership to replace Representative Liz Cheney of Wyoming, who remains among the most forthright critics of the party’s illiberal turn.Ms. Cheney tweeted on Monday: “The House GOP leadership has enabled white nationalism, white supremacy, and anti-semitism. History has taught us that what begins with words ends in far worse.
$~$
$~$
Once we have the full, clean text, we are going to process it.
$~$
$~$
# functions we are going to use in this project
def clean_text(text):
'''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
text = text.lower()
text = text.replace("—","")
text = re.sub('\[.*?\]', ' ', text)
text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
text = re.sub('\w*\d\w*', '', text)
return text
#
def clean_text2(text):
'''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
text = re.sub('[‘’“”…«»]', '', text)
text = re.sub('\n', ' ', text)
return text
def wordListToFreqDict(wordlist):
""" create a list of words in a dictionary of words frequency"""
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(list(zip(wordlist,wordfreq)))
def show_values(axs, orient="v", space=.01):
""" shows the value of the features in the top of the bar plot """
def _single(ax):
if orient == "v":
for p in ax.patches:
_x = p.get_x() + p.get_width() / 2
_y = p.get_y() + p.get_height() + (p.get_height()*0.01)
value = '{:.0f}'.format(p.get_height())
ax.text(_x, _y, value, ha="center")
elif orient == "h":
for p in ax.patches:
_x = p.get_x() + p.get_width() + float(space)
_y = p.get_y() + p.get_height() - (p.get_height()*0.5)
value = '{:.0f}'.format(p.get_width())
ax.text(_x, _y, value, ha="left")
if isinstance(axs, np.ndarray):
for idx, ax in np.ndenumerate(axs):
_single(ax)
else:
_single(axs)
def clean_stop_words(text):
"""remove stopwords from a given text"""
# define stopwords
stop_words=set(stopwords.words("english"))
#removing stopwords
filtered_sent=[]
for w in text :
if w not in stop_words:
filtered_sent.append(w)
return filtered_sent
def createStemmer(listOfWords):
"""create a list of stemmer words from a
list of words"""
ps = PorterStemmer()
stemmerlist = []
for w in listOfWords:
stemmerlist.append(ps.stem(w))
return stemmerlist
def sentiment_texblob(ind,text):
"""return the polarity or subjectivity
of a given text using the TextBlob library:
ind = 0 for polarity
ind = 1 for subjectivity"""
tex = TextBlob(text)
result = tex.sentiment
return list(result)[ind]
def get_sia(text):
"""given a text, return a VADER score"""
sia = SentimentIntensityAnalyzer()
sia_score = sia.polarity_scores(text)
return sia_score
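As a quick sanity check, the cleaning helpers can be tried on a made-up sentence before applying them to the article (the sample string below is purely illustrative):
#illustrative smoke test of the helpers defined above
sample = "In 2022, the Editorial Board wrote: “Replacement theory” is not new."
cleaned = clean_text2(clean_text(sample))
print(cleaned)
#frequency dictionary of the cleaned tokens
print(wordListToFreqDict(word_tokenize(cleaned)))
#note: collections.Counter(word_tokenize(cleaned)) would build the same mapping in a single pass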
$~$
$~$
# Split article_text into sentences: sentences
sentences = sent_tokenize(article_text)
$~$
We will save each of the sentences in a dataframe:
$~$
#keep sentence into dataframe
data_text = pd.DataFrame(sentences,columns=["transcript"])
#show the dataframe
data_text.head()
transcript | |
---|---|
0 | Republican politicians, including some of the ... |
1 | A recently published poll found that almost ha... |
2 | The suspect, identified as Payton S. Gendron, ... |
3 | Gendron described himself as part of a movement. |
4 | He said that he was inspired by similar attack... |
$~$
$~$
round1 = lambda x: clean_text(x)
data_text["clean"] = data_text.transcript.apply(round1)
data_text.head()
transcript | clean | |
---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... |
$~$
$~$
round2 = lambda x: clean_text2(x)
data_text["clean2"] = pd.DataFrame(data_text.clean.apply(round2))
data_text.head()
transcript | clean | clean2 | |
---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... |
$~$
$~$
unique = lambda x: word_tokenize(x)
data_text["unique_token"] = pd.DataFrame(data_text.clean2.apply(unique))
data_text.head()
transcript | clean | clean2 | unique_token | |
---|---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... | [republican, politicians, including, some, of,... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... | [a, recently, published, poll, found, that, al... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... | [the, suspect, identified, as, payton, s, gend... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement | [gendron, described, himself, as, part, of, a,... |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... | [he, said, that, he, was, inspired, by, simila... |
$~$
$~$
unique = lambda x: wordListToFreqDict(x)
data_text["freq"] = pd.DataFrame(data_text.unique_token.apply(unique))
data_text.head()
transcript | clean | clean2 | unique_token | freq | |
---|---|---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... | [republican, politicians, including, some, of,... | {'republican': 1, 'politicians': 1, 'including... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... | [a, recently, published, poll, found, that, al... | {'a': 4, 'recently': 1, 'published': 1, 'poll'... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... | [the, suspect, identified, as, payton, s, gend... | {'the': 2, 'suspect': 1, 'identified': 1, 'as'... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement | [gendron, described, himself, as, part, of, a,... | {'gendron': 1, 'described': 1, 'himself': 1, '... |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... | [he, said, that, he, was, inspired, by, simila... | {'he': 3, 'said': 1, 'that': 2, 'was': 1, 'ins... |
$~$
We now have a dataframe that contains each of the pre-processed sentences, which we will continue to use later.
$~$
$~$
raw_text= article_text
# first round of clean
article_text_clean = clean_text(article_text)
# second round of clean
article_text_clean2 = clean_text2(article_text_clean)
# Tokenize all the text
article_text_token = word_tokenize(article_text_clean2)
#create a dict of frequencies of the tokenize list
dic_freq = wordListToFreqDict(article_text_token)
#create a dataframe with the dictionary of words frequency
df_freq = pd.DataFrame(list(dic_freq.items()),columns = ['words','freq'])
df_freq.sort_values(by="freq",ascending=False).head()
words | freq | |
---|---|---|
5 | the | 77 |
4 | of | 46 |
18 | that | 25 |
24 | to | 23 |
69 | in | 22 |
#keep the 20 most used words in a separate dataframe
df_freq_to_plot = df_freq.nlargest(n=20, columns=['freq'], keep='all')
#create a plot with the 20 most used words in the article
sns.set_context("talk")
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize =(9,9))
ax = sns.barplot(y="words", x="freq", data=df_freq_to_plot,palette="Blues_r",ax=ax)
ax.bar_label(ax.containers[0])
plt.title("Top 20 most used words\n (in The New York Times editorial (May 16, 2022) )")
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.show()
$~$
As you can see from the plot above, the most commonly used words are words with little or no meaning of their own that simply help form a sentence. To get a better idea of the meaningful words used in the text, we have to remove these words (known as stopwords), which is precisely what we will do next. First, let's take a quick look at what NLTK's English stopword list contains.
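A quick peek at the list (this assumes the stopwords corpus has already been downloaded with nltk.download("stopwords")):
#peek at NLTK's English stopword list
sw = stopwords.words("english")
print(len(sw), "stopwords, for example:", sw[:10])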
$~$
$~$
$~$
clean_s_w = lambda x : clean_stop_words(x)
data_text["without_sw"] = pd.DataFrame(data_text["unique_token"].apply(clean_s_w))
data_text.head()
transcript | clean | clean2 | unique_token | freq | without_sw | |
---|---|---|---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... | [republican, politicians, including, some, of,... | {'republican': 1, 'politicians': 1, 'including... | [republican, politicians, including, partys, t... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... | [a, recently, published, poll, found, that, al... | {'a': 4, 'recently': 1, 'published': 1, 'poll'... | [recently, published, poll, found, almost, hal... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... | [the, suspect, identified, as, payton, s, gend... | {'the': 2, 'suspect': 1, 'identified': 1, 'as'... | [suspect, identified, payton, gendron, wrote, ... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement | [gendron, described, himself, as, part, of, a,... | {'gendron': 1, 'described': 1, 'himself': 1, '... | [gendron, described, part, movement] |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... | [he, said, that, he, was, inspired, by, simila... | {'he': 3, 'said': 1, 'that': 2, 'was': 1, 'ins... | [said, inspired, similar, attacks, minority, c... |
$~$
$~$
unique = lambda x: wordListToFreqDict(x)
data_text["freq2"] = pd.DataFrame(data_text.without_sw.apply(unique))
data_text.head()
transcript | clean | clean2 | unique_token | freq | without_sw | freq2 | |
---|---|---|---|---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... | [republican, politicians, including, some, of,... | {'republican': 1, 'politicians': 1, 'including... | [republican, politicians, including, partys, t... | {'republican': 1, 'politicians': 1, 'including... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... | [a, recently, published, poll, found, that, al... | {'a': 4, 'recently': 1, 'published': 1, 'poll'... | [recently, published, poll, found, almost, hal... | {'recently': 1, 'published': 1, 'poll': 1, 'fo... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... | [the, suspect, identified, as, payton, s, gend... | {'the': 2, 'suspect': 1, 'identified': 1, 'as'... | [suspect, identified, payton, gendron, wrote, ... | {'suspect': 1, 'identified': 1, 'payton': 1, '... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement | [gendron, described, himself, as, part, of, a,... | {'gendron': 1, 'described': 1, 'himself': 1, '... | [gendron, described, part, movement] | {'gendron': 1, 'described': 1, 'part': 1, 'mov... |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... | [he, said, that, he, was, inspired, by, simila... | {'he': 3, 'said': 1, 'that': 2, 'was': 1, 'ins... | [said, inspired, similar, attacks, minority, c... | {'said': 1, 'inspired': 1, 'similar': 1, 'atta... |
$~$
After removing the stopwords, we are going to create a new dataframe with the word frequencies for the entire text. Then we will generate a plot showing the 20 most used words and compare it with the previous one (where the stopwords had not been removed).
$~$
#remove the stopwords
art_without_sw = clean_stop_words(article_text_token)
#calculate the frequency of the words (without stopswords)
dic_freq2 = wordListToFreqDict(art_without_sw)
#create a dataframe with the frequency of words
df_freq2 = pd.DataFrame(list(dic_freq2.items()),columns = ['words','freq'])
#show the dataframe
df_freq2.sort_values(by="freq",ascending=False).head()
words | freq | |
---|---|---|
39 | people | 14 |
9 | white | 12 |
37 | replacement | 10 |
12 | theory | 8 |
91 | new | 6 |
#keep 20 most used words in a new dataframe
df_freq_to_plot2 = df_freq2.nlargest(n=20, columns=['freq'])
#generate a plot
sns.set_context("talk")
plt.style.use('ggplot')
#plot 1 with stopwords
fig, axes = plt.subplots(1,2,figsize =(20, 10))
ax = sns.barplot(y="words", x="freq", data=df_freq_to_plot,palette="Blues_r",ax=axes[0])
show_values(ax,"h",space=0)
ax.set(title="Top 20 most used words\n (The New York Times editorial (May 16, 2022) )")
ax.tick_params(axis="y",labelsize=12)
#plot 2 after removing stopwords
ax1 = sns.barplot(y="words", x="freq", data=df_freq_to_plot2, palette="Greens_r",ax=axes[1])
ax1.set(title="Top 20 most used words (after removing stopwords)\n (in The New York Times editorial (May 16, 2022) ) ")
ax1.tick_params(axis="y",labelsize=12)
show_values(ax1,"h",space=0)
plt.tight_layout()
plt.show()
$~$
Once the stopwords are removed from the article, the most used words shown in the graph above make more sense.
$~$
$~$
$~$
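The next step applies stemming, which reduces each word to its root so that variants such as "replace", "replaced" and "replacement" are counted together. A minimal illustration with NLTK's PorterStemmer on a few words taken from the article's vocabulary:
#minimal illustration of Porter stemming
ps = PorterStemmer()
for w in ["replacement", "replaced", "people", "theory", "americans"]:
    print(w, "->", ps.stem(w))
#for example, "replacement" and "replaced" both map to "replac", and "theory" to "theori"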
#apply the stemmer
stemmer = lambda x : createStemmer(x)
#keep result in new column
data_text["stemmer"] = pd.DataFrame(data_text.without_sw.apply(stemmer))
#compute the frequency of the stemmed words
unique = lambda x: wordListToFreqDict(x)
#keep result in new column
data_text["freq_stemmer"] = pd.DataFrame(data_text.stemmer.apply(unique))
data_text.head()
transcript | clean | clean2 | unique_token | freq | without_sw | freq2 | stemmer | freq_stemmer | |
---|---|---|---|---|---|---|---|---|---|
0 | Republican politicians, including some of the ... | republican politicians including some of the ... | republican politicians including some of the ... | [republican, politicians, including, some, of,... | {'republican': 1, 'politicians': 1, 'including... | [republican, politicians, including, partys, t... | {'republican': 1, 'politicians': 1, 'including... | [republican, politician, includ, parti, top, l... | {'republican': 1, 'politician': 1, 'includ': 1... |
1 | A recently published poll found that almost ha... | a recently published poll found that almost ha... | a recently published poll found that almost ha... | [a, recently, published, poll, found, that, al... | {'a': 4, 'recently': 1, 'published': 1, 'poll'... | [recently, published, poll, found, almost, hal... | {'recently': 1, 'published': 1, 'poll': 1, 'fo... | [recent, publish, poll, found, almost, half, r... | {'recent': 1, 'publish': 1, 'poll': 1, 'found'... |
2 | The suspect, identified as Payton S. Gendron, ... | the suspect identified as payton s gendron ... | the suspect identified as payton s gendron ... | [the, suspect, identified, as, payton, s, gend... | {'the': 2, 'suspect': 1, 'identified': 1, 'as'... | [suspect, identified, payton, gendron, wrote, ... | {'suspect': 1, 'identified': 1, 'payton': 1, '... | [suspect, identifi, payton, gendron, wrote, on... | {'suspect': 1, 'identifi': 1, 'payton': 1, 'ge... |
3 | Gendron described himself as part of a movement. | gendron described himself as part of a movement | gendron described himself as part of a movement | [gendron, described, himself, as, part, of, a,... | {'gendron': 1, 'described': 1, 'himself': 1, '... | [gendron, described, part, movement] | {'gendron': 1, 'described': 1, 'part': 1, 'mov... | [gendron, describ, part, movement] | {'gendron': 1, 'describ': 1, 'part': 1, 'movem... |
4 | He said that he was inspired by similar attack... | he said that he was inspired by similar attack... | he said that he was inspired by similar attack... | [he, said, that, he, was, inspired, by, simila... | {'he': 3, 'said': 1, 'that': 2, 'was': 1, 'ins... | [said, inspired, similar, attacks, minority, c... | {'said': 1, 'inspired': 1, 'similar': 1, 'atta... | [said, inspir, similar, attack, minor, commun,... | {'said': 1, 'inspir': 1, 'similar': 1, 'attack... |
$~$
$~$
#stemming the text
stemme_text = createStemmer(art_without_sw)
#compute the frequency of the stemmed words
freqsteme = wordListToFreqDict(stemme_text)
#create a dataframe with the frequency of the stemmed words
df_freqSteme = pd.DataFrame(list(freqsteme.items()),columns = ['words','freq'])
#show the dataframe
df_freqSteme.sort_values(by="freq",ascending=False).head()
words | freq | |
---|---|---|
37 | peopl | 14 |
35 | replac | 14 |
9 | white | 12 |
18 | american | 10 |
12 | theori | 8 |
#keep data to plot
df_freq_to_plot3 = df_freqSteme.nlargest(n=20, columns=['freq'])
#generate a plot
sns.set_context("talk")
plt.style.use('ggplot')
fig, axes = plt.subplots(1,3,figsize =(20, 10))
#plot 1 with stopwords
ax = sns.barplot(y="words", x="freq", data=df_freq_to_plot,palette="Blues_r",ax=axes[0])
show_values(ax,"h",space=0)
ax.set(title="Top 20 most used words\n (The New York Times editorial (May 16, 2022) )")
ax.tick_params(axis="y",labelsize=12)
#plot 2 after removing stopwords
ax1 = sns.barplot(y="words", x="freq", data=df_freq_to_plot2, palette="Greens_r",ax=axes[1])
ax1.set(title="Top 20 most used words (after removing stopwords)\n (in The New York Times editorial (May 16, 2022) )")
ax1.tick_params(axis="y",labelsize=12)
show_values(ax1,"h",space=0)
#plot 3 after stemmers
ax2 = sns.barplot(y="words", x="freq", data=df_freq_to_plot3, palette="Oranges_r",ax=axes[2])
ax2.set(title="Top 20 most used words (after removing stopwords & stemming)\n (in The New York Times editorial (May 16, 2022) )")
ax2.tick_params(axis="y",labelsize=12)
show_values(ax2,"h",space=0)
plt.tight_layout()
plt.show()
$~$
Named Entity Recognition, or NER for short, is a natural language processing task used to identify important named entities in a text, such as people, places and organizations; they can even be dates, states, works of art and other categories, depending on the library and annotation scheme you use. NER can be used alongside topic identification, or on its own, to determine the important items in a text or to answer basic natural language understanding questions such as who? what? when? and where?
$~$
Let's perform NER on our text so that we can answer the questions who?, when? and where?. For this we are going to use the spaCy library, which lets us do this effectively.
$~$
# Instantiate the English model: nlp
nlp = spacy.load("en_core_web_sm")
# Create a new document: doc
doc = nlp(raw_text)
label = []
entite = []
# Print all of the found entities and their labels
for ent in doc.ents:
label.append(ent.label_),entite.append(ent.text)
#keep entities in a new dataframe
data_entite = pd.DataFrame(list(zip(label,entite)), columns=["label","entite"])
data_entite.head()
label | entite | |
---|---|---|
0 | NORP | Republican |
1 | NORP | Americans |
2 | CARDINAL | almost half |
3 | NORP | Republicans |
4 | GPE | the United States |
$~$
We are going to make a plot to see how the entities in our text are distributed.
$~$
sns.set_context("talk")
plt.style.use('ggplot')
data_entite.label.value_counts().plot.pie(y="Volume",wedgeprops=dict(width=0.5),
figsize=(8, 8),
autopct="%1.0f%%",
pctdistance=0.75,cmap="Oranges_r",title="Distribution of entities in the article")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2,
borderaxespad=0.0, fontsize='small')
plt.axis("off")
plt.tight_layout()
plt.show()
$~$
Note that spaCy's entity labels include NORP (nationalities or religious/political groups), ORG (organizations), PERSON (people), GPE (geopolitical entities such as countries, cities and states), DATE (absolute or relative dates) and CARDINAL (numerals).
$~$
Knowing this, we can say that our text mostly contains information about people, places and national or political groups. Below is an approximation of how we could answer the questions posed.
$~$
WHO?:
df_who = pd.DataFrame(data_entite[(data_entite.label == "NORP")
|(data_entite.label == "PERSON")|
(data_entite.label == "ORG")].value_counts().sort_values().sort_index(),columns=["total"])
df_who
total | ||
---|---|---|
label | entite | |
NORP | American | 5 |
Americans | 5 | |
French | 3 | |
German | 1 | |
Hispanic | 1 | |
Jewish | 2 | |
Muslim | 2 | |
Republican | 2 | |
Republicans | 2 | |
rhetoricians | 1 | |
ORG | Fox News | 1 |
GOP | 1 | |
House | 2 | |
Representative Elise Stefanik | 1 | |
The New York Times | 2 | |
the Democratic Party | 1 | |
PERSON | Angela Merkel’s | 1 |
Carlson | 2 | |
Cheney | 1 | |
Gendron | 3 | |
George Soros | 1 | |
Latino | 1 | |
Liz Cheney | 1 | |
Matt Gaetz | 1 | |
Payton S. Gendron | 1 | |
Robert Donald | 1 | |
Stefanik | 3 | |
Tucker Carlson | 1 |
WHERE?:
df_where = pd.DataFrame(data_entite[data_entite.label == "GPE"].value_counts(),columns=["total"])
df_where
total | ||
---|---|---|
label | entite | |
GPE | Buffalo | 3 |
the United States | 3 | |
El Paso | 2 | |
New Zealand | 2 | |
America | 1 | |
Charleston | 1 | |
Florida | 1 | |
New York | 1 | |
Pittsburgh | 1 | |
S.C. | 1 | |
Texas | 1 | |
Washington | 1 | |
Wyoming | 1 |
WHEN?:
df_when = pd.DataFrame(data_entite[data_entite.label == "DATE"].value_counts(),columns=["total"])
df_when
total | ||
---|---|---|
label | entite | |
DATE | 2019 | 2 |
2011 | 1 | |
2015 | 1 | |
2018 | 1 | |
April 2021 | 1 | |
Monday | 1 | |
Saturday | 1 | |
September | 1 | |
last year | 1 | |
recent years | 1 |
$~$
In addition to NER, we can also generate a word cloud to get an idea of what the text is about.
$~$
$~$
$~$
As you know, a word cloud is a technique for displaying the frequent words in a text, where the size of each word represents its frequency.
sns.set_context("talk")
plt.style.use('ggplot')
stop_words=set(stopwords.words("english"))
df_wordcloud = WordCloud(stopwords=stop_words,background_color="white",colormap='inferno',random_state=2022).generate_from_frequencies(dic_freq2)
fig, axes = plt.subplots(figsize =(10, 10))
plt.imshow(df_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
$~$
We are going to perform the sentiment analysis using two tools:
$~$
Using the TextBlob library: we will use its .sentiment property, which returns a tuple of the form (polarity, subjectivity), where polarity is a float within the range [-1.0, 1.0] and subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.
Using the pre-trained VADER model for NLTK sentiment analysis: it is a model for text sentiment analysis that is sensitive to both polarity (positive/negative) and intensity (strength). SentimentIntensityAnalyzer is the analyzer object, and its polarity_scores method returns the categories neg, neu, pos and compound.
The compound score aggregates the valence scores of the individual words and normalizes the result to lie between -1 (most extreme negative) and +1 (most extreme positive); the closer the compound score is to +1, the more positive the text. A common convention is to call a text positive when compound >= 0.05, negative when compound <= -0.05 and neutral otherwise; the toy example below illustrates both tools.
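Before applying these tools to the article, here is a minimal illustration of both scores on a made-up sentence (VADER assumes the vader_lexicon resource has been downloaded with nltk.download("vader_lexicon")):
#toy illustration of both sentiment tools on a made-up sentence
toy = "This is a wonderful, hopeful article."
print(TextBlob(toy).sentiment)                            # Sentiment(polarity=..., subjectivity=...)
print(SentimentIntensityAnalyzer().polarity_scores(toy))  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}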
$~$
$~$
using TextBlob:
$~$
For this we are going to use the sentiment_texblob() function defined above. We will evaluate each sentence of the text and store the results for both polarity and subjectivity in the dataframe, assigning two new columns with the respective names.
$~$
#keep polarity
data_text["polarity"] = data_text.transcript.apply(lambda x: sentiment_texblob(0,x))
#keep subjetivity
data_text["subjetivity"] = data_text.transcript.apply(lambda x: sentiment_texblob(1,x))
print("sentence with min polarity: ", data_text["polarity"].min(),
"\nsentence: ", data_text["transcript"][data_text["polarity"].idxmin()])
sentence with min polarity: -0.4318181818181818 sentence: The office of one of those politicians, Representative Elise Stefanik of New York, said in a statement that the Buffalo attack was an “act of evil” and that she “has never advocated for any racist position.”The matter is not so simple.Replacement theory is an attack on democracy.
print("sentence with max polarity: ",data_text["polarity"].max(),
"\nsentence: ",data_text["transcript"][data_text["polarity"].idxmax()])
sentence with max polarity: 0.3333333333333333 sentence: As a practical matter, almost anyone can buy guns that are designed to kill a lot of people quickly.
sentence with min subjectivity:
print("sentence with min subjetivity: ",data_text["subjetivity"].min(),
"\nsentence: ",data_text["transcript"][data_text["subjetivity"].idxmin()])
sentence with min subjetivity: 0.0 sentence: Gendron described himself as part of a movement.
sentence with max subjectivity:
print("sentence with max subjetivity: ",data_text["subjetivity"].max(),
"\nsentence: ",data_text["transcript"][data_text["subjetivity"].idxmax()])
sentence with max subjetivity: 0.8 sentence: History has taught us that what begins with words ends in far worse.
metrics for the entire article (using TextBlob):
polarity = sentiment_texblob(0,raw_text)
subjetivity = sentiment_texblob(1,raw_text)
print("polarity find in the article: ",polarity)
print("subjetivity find in the article: ",subjetivity)
polarity find in the article: 0.020867717352092346 subjetivity find in the article: 0.3544879599567099
$~$
Based on the results obtained (polarity = 0.020 and subjectivity = 0.35), we can conclude that overall the article has a fairly objective style and near-neutral polarity.
$~$
$~$
using VADER:
$~$
For this we are going to use the get_sia() function defined above, which receives a text and returns its polarity scores. We will evaluate each sentence and store the results in the dataframe, assigning new columns "neg", "neu", "pos" and "compound" with the respective metrics.
$~$
#keep negative metric
data_text["neg"] = data_text.transcript.apply(lambda x: (get_sia(x))["neg"])
#keep neutral metric
data_text["neu"] = data_text.transcript.apply(lambda x: (get_sia(x))["neu"])
#keep positive metric
data_text["pos"] = data_text.transcript.apply(lambda x: (get_sia(x))["pos"])
#keep compound metric
data_text["compound"] = data_text.transcript.apply(lambda x: (get_sia(x))["compound"])
sentence with the most negative sentiment:
print("sentence most negative sentiment: ",data_text["compound"].min(),
"\nsentence: ",data_text["transcript"][data_text["compound"].idxmin()])
sentence most negative sentiment: -0.983 sentence: The suspects in several mass killings in recent years, including the 2015 murder of nine Black worshipers at a church in Charleston, S.C.; the 2018 murder of 11 Jewish worshipers at a synagogue in Pittsburgh; the 2019 murder of 51 Muslim worshipers at a pair of mosques in New Zealand; and the 2019 murder of 23 people, many Latino, in El Paso also propounded versions of this racist worldview.American life is punctuated by mass shootings that are routinely described as idiosyncratic.
sentence with the most positive sentiment:
print("sentence most positive sentiment: ",data_text["compound"].max(),
"\nsentence: ",data_text["transcript"][data_text["compound"].idxmax()])
sentence most positive sentiment: 0.6124 sentence: The same argument is easily applied to other minority groups.The French author Renaud Camus coined the term “the great replacement” in a 2011 book to describe what he saw as a conscious effort by French elites to open the country’s doors for Muslim immigrants to replace the ethnically French population and culture.The template has been adapted for use by extremists around the world.
metrics for the entire article (using VADER):
get_sia(raw_text)
{'neg': 0.133, 'neu': 0.804, 'pos': 0.063, 'compound': -0.9984}
$~$
From the scores obtained (in particular the compound score of -0.9984), we can conclude that the article conveys above all negative sentiment.
$~$
$~$
$~$
The scikit-learn library allows the use of pipelines, a tool that lets us be more efficient throughout the process of creating machine learning models. In our case we used it to streamline the data pre-processing.
Regardless of the model being built, tuning its hyperparameters is an effective way to improve performance. We verified this with our model: we improved its metrics by adjusting the hyperparameters with the grid search method (a compact reference sketch follows these conclusions).
Web scraping is a technique we can use to obtain data; in our case we worked with text, but it could be used to obtain other types of data.
Natural Language Processing (NLP) libraries can help us understand the sentiment a text conveys, as we have seen with the two libraries we worked with.
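As a closing reference, here is a compact sketch of the typical scikit-learn pattern summarized above: preprocessing and the classifier combined in a single Pipeline so that GridSearchCV tunes everything together. This is an illustrative variant, not the exact code used in this project; X_train_raw is a hypothetical name for the un-encoded "y"/"n" training features.
#compact reference sketch: imputer, encoder and classifier tuned together in one pipeline
full_pipe = Pipeline([
    ("imputation", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="if_binary")),
    ("clf", RandomForestClassifier(random_state=42)),
])
param_grid = {"clf__n_estimators": [100, 200], "clf__max_depth": [4, 8, None]}
search = GridSearchCV(full_pipe, param_grid, cv=5, scoring="f1_macro")
#search.fit(X_train_raw, y_train) would then impute and encode inside each CV fold,
#avoiding any preprocessing leakage between folds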
$~$