# Packages used by the ingestion step ----
library(pdftools)  # pdf_text()
library(quanteda)  # corpus(), tokens(), tokens_select()

# Collect the PDF documents in the working directory ----
# The pattern is anchored on the ".pdf" extension so that names merely ending
# in "pdf" are not picked up, and `full.names` is spelled out instead of
# relying on partial argument matching.
files <- list.files(pattern = "\\.pdf$", full.names = TRUE)
if (length(files) == 0L) {
  stop("No PDF files found in ", getwd(), call. = FALSE)
}

# Read every PDF into a single string ----
# pdf_text() returns one element per page; the pages are joined with a space
# so that each document ends up as exactly one string. vapply() guarantees a
# character vector with one entry per file.
filestextdataframe <- data.frame(
  Document = files,
  text = vapply(
    files,
    function(x) paste0(pdf_text(x), collapse = " "),
    character(1)
  )
)

# Keep only the sentences that mention "privacy" ----
# The documents are tokenised into sentences, then only the sentences
# matching the regular expression "privacy" are kept.
corpus_privacy_sentences <- corpus(filestextdataframe$text)
tok_privacy_sentences <- tokens_select(
  tokens(corpus_privacy_sentences, what = "sentence"),
  pattern = "privacy",
  valuetype = "regex",
  selection = "keep"
)
# Sector lookup ----
# Maps each sector to the company tokens that appear in the file names.
# The entries must be spelled exactly like the tokens in `company_tokens`
# below (including case), otherwise the sector lookup yields NA.
sector_data <- list(
  industrials = c(
    "3M", "Boeing", "Caterpillar", "Deere", "GeneralElectric", "Honeywell",
    "LockheedMartin", "Raytheon", "UnionPacificCoporation", "UPS"
  ),
  health_care = c(
    "Abbott", "AbbVie", "Amgen", "Anthem", "Bristol-MyersSquibb", "Cigna",
    "CVSHealth", "Danaher", "EliLilly", "GileadSciences",
    "IntuitiveSurgical", "Johnson&Johnson", "Medtronic", "Merck", "Pfizer",
    "Stryker", "Thermo_Fisher_Scientific", "UnitedHealthGroup"
  ),
  communication_services = c(
    "alphabet", "AT&T", "Charter", "Comcast", "facebook", "Netflix",
    "T-Mobile", "Verizon", "WaltDisney"
  ),
  information_technology = c(
    "Accenture", "Adobe", "AdvancedMicroDevices", "apple",
    "AppliedMaterials", "Broadcom", "Cisco", "Fidelity", "Intel", "Intuit",
    "Lam", "Mastercard", "MicronTechnology", "NVIDIA", "OracleCorporation",
    "PayPal", "Qualcomm", "Salesforce", "Servicenow", "TexasInstruments",
    "Visa"
  ),
  consumer_discretionary = c(
    "amazon", "Booking", "HomeDepot", "Lowe's", "McDonalds", "Nike",
    "Starbucks", "Target", "Tesla", "TJX"
  ),
  utilities = c("DukeEnergy", "Nexteraenergy"),
  financials = c(
    "AmericanExpress", "BerkshireHathaway", "BlackRock", "CharlesSchwab",
    "Citi", "JP", "MorganStanley", "S&P", "WellsFargo"
  ),
  materials = c("Linde"),
  real_estate = c("CrownCastle", "Prologis"),
  consumer_staples = c(
    "Altria", "CocaCola", "Costco", "Mondelez", "P&G", "PepsiCo",
    "PhilipMoris", "Walmart"
  ),
  energy = c("Chevron", "Exxon")
)
# TODO: "ADP" is extracted as a company below but has no sector entry yet,
# so its sector is NA until it is added to one of the vectors above.

# Invert the list into a named vector: company token -> sector name.
company_sector <- setNames(
  rep(names(sector_data), lengths(sector_data)),
  unlist(sector_data, use.names = FALSE)
)

# Company tokens searched for in the file names ----
# Regex alternation is ordered: the first alternative that matches wins.
# A token that is a prefix of another one must therefore come AFTER the
# longer token ("IntuitiveSurgical" before "Intuit").
company_tokens <- c(
  "3M", "Abbott", "AbbVie", "Accenture", "Adobe", "ADP",
  "AdvancedMicroDevices", "alphabet", "Altria", "amazon", "AmericanExpress",
  "Amgen", "Anthem", "apple", "AppliedMaterials", "AT&T",
  "BerkshireHathaway", "BlackRock", "Boeing", "Booking",
  "Bristol-MyersSquibb", "Broadcom", "Caterpillar", "CharlesSchwab",
  "Charter", "Chevron", "Cigna", "Cisco", "Citi", "CocaCola", "Comcast",
  "Costco", "CrownCastle", "CVSHealth", "Danaher", "Deere", "DukeEnergy",
  "EliLilly", "Exxon", "facebook", "Fidelity", "GeneralElectric",
  "GileadSciences", "HomeDepot", "Honeywell", "Intel", "IntuitiveSurgical",
  "Intuit", "Johnson&Johnson", "JP", "Lam", "Linde", "LockheedMartin",
  "Lowe's", "Mastercard", "McDonalds", "Medtronic", "Merck",
  "MicronTechnology", "Mondelez", "MorganStanley", "Netflix",
  "Nexteraenergy", "Nike", "NVIDIA", "OracleCorporation", "P&G", "PayPal",
  "PepsiCo", "Pfizer", "PhilipMoris", "Prologis", "Qualcomm", "Raytheon",
  "S&P", "Salesforce", "Servicenow", "Starbucks", "Stryker", "T-Mobile",
  "Target", "Tesla", "TexasInstruments", "Thermo_Fisher_Scientific", "TJX",
  "UnionPacificCoporation", "UnitedHealthGroup", "UPS", "Verizon", "Visa",
  "Walmart", "WaltDisney", "WellsFargo"
)
company_pattern <- paste(company_tokens, collapse = "|")

# Company per document; NA when no token occurs in the file name.
company <- stringr::str_extract(filestextdataframe$Document, company_pattern)

# Finished data frame ----
# One row per PDF. All metadata columns are parsed from the file name.
Table <- data.frame(
  Company = company,
  Report = stringr::str_extract(
    filestextdataframe$Document, "annual|transcript|quarterly"
  ),
  Quartal = stringr::str_extract(
    filestextdataframe$Document, "annual|q1|q2|q3|q4"
  ),
  Date = stringr::str_extract(
    filestextdataframe$Document, "2015|2016|2017|2018|2019|2020|2021"
  ),
  # Reuse the text that was already extracted instead of parsing every PDF a
  # second time; naming it by document keeps the file paths as row names.
  Text = setNames(filestextdataframe$text, filestextdataframe$Document),
  # All "privacy" sentences of a document collapsed into one string
  # ("" when the document has none).
  privacy = vapply(
    tok_privacy_sentences,
    function(x) paste(as.character(x), collapse = " "),
    character(1)
  ),
  # Sector of the extracted company; NA for unknown or unmapped companies.
  sector = unname(company_sector[company])
)
# That's the code I've got so far. Sorry it's a bit messy; I don't know how to post it properly, to be honest.