I'm new to programming and machine learning, and I'm using code I found in a journal paper for spam detection. When I run it, it fails with an error even though I prepared the data as described. The error message is `ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.`
Can anyone help me out with this issue?
[The complete code is here](https://github.com/ijdutse/spd)
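For context, this `ValueError` comes from scikit-learn's input validation: `predict()` refuses a feature matrix with zero rows. A minimal sketch (with a toy model standing in for the repo's `trained_rf.pkl`) that reproduces the same error:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Toy stand-in for the pickled spam filter: 19 features, binary labels.
clf = RandomForestClassifier(n_estimators=10)
clf.fit(np.random.rand(20, 19), np.random.randint(0, 2, 20))

# Predicting on a matrix with zero rows raises the error from the question:
# ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.
clf.predict(np.empty((0, 19)))
```

So the real question is why `X` ends up with zero rows by the time it reaches `predict()` in the script below.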
```python
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft

data_sources = ['phone.json']


def main():
    spd = Spd(data_sources)  # class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    return relevant_tweets


class Spd:
    """Some functions to accept raw files, extract relevant fields and filter out irrelevant content."""

    def __init__(self, data_sources):
        self.data_sources = data_sources

    # first function in the class:
    def extractor(self, data_source):  # accept a file of raw tweets in the form of JSON objects
        data_extracts = {'TweetID': [], 'ScreenName': [], 'RawTweets': [], 'CreatedAt': [], 'RetweetCount': [],
                         'FollowersCount': [], 'FriendsCount': [], 'StatusesCount': [], 'FavouritesCount': [],
                         'UserName': [], 'Location': [], 'AccountCreated': [], 'Language': [], 'Description': [],
                         'UserURL': [], 'VerifiedAccount': [], 'CleanTweets': [], 'UserID': [], 'TimeZone': [],
                         'TweetFavouriteCount': []}
        non_english_tweets = 0  # keep track of the non-English tweets
        with codecs.open('phone.json', 'r') as f:  # NOTE: the filename is hard-coded; data_source is not used here
            for line in f.readlines():
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en', 'en-gb', 'en-GB', 'en-AU', 'en-IN', 'en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])
                        data_extracts['ScreenName'].append(line['ScreenName'])
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets += 1
                except:
                    # silently skips any line that fails to parse or lacks one of the
                    # expected keys -- if every line is skipped, df0 ends up empty
                    continue
        df0 = pd.DataFrame(data_extracts)  # convert data extracts to a pandas DataFrame
        df0['CreatedAt'] = pd.to_datetime(data_extracts['CreatedAt'], errors='coerce')  # convert to datetime
        df0['AccountCreated'] = pd.to_datetime(data_extracts['AccountCreated'], errors='coerce')
        df0 = df0.dropna(subset=['AccountCreated', 'CreatedAt'])  # drop rows with unparseable datetimes
        AccountAge = []  # compute the age of each account
        date_format = "%Y-%m-%d %H:%M:%S"
        for dr, dc in zip(df0.CreatedAt, df0.AccountCreated):
            d1 = datetime.strptime(str(dr), date_format)
            d2 = datetime.strptime(str(dc), date_format)
            AccountAge.append((d1 - d2).days)
        df0['AccountAge'] = AccountAge
        # add/define additional features ...
        df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[:1] == ['RT'])  # guards against empty tweets
        df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x)))
        df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
        df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
        df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
        df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
        df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
        df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
        df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
        df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
        df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
        df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
        df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
        df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
        df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
        # remove all RTs, set UserID as index and save relevant files:
        df0 = df0[df0.Retweets.values == False]  # remove retweets
        df0 = df0.set_index('UserID')
        df0 = df0[~df0.index.duplicated()]  # remove duplicate tweets
        df0.to_csv(data_source[:5] + 'all_extracts.csv')  # save all extracts as CSV
        with open(data_source[:5] + 'non_English.txt', 'w') as d:  # save the count of non-English tweets
            d.write('{}'.format(non_english_tweets))
        return df0

    def detector(self, data_sources):  # accept a list of files of raw tweets as JSON objects
        self.data_sources = data_sources
        for data_source in data_sources:
            df0 = self.extractor(data_source)
            # drop fields not required for prediction
            X = df0.drop(['Language', 'TweetID', 'RawTweets', 'CleanTweets', 'CreatedAt', 'AccountCreated',
                          'ScreenName', 'Retweets', 'UserName', 'Location', 'Description', 'UserURL',
                          'VerifiedAccount', 'RetweetCount', 'TimeZone', 'TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf, -np.inf], np.nan)  # the zero divisions above produce inf; mark them as NaN ...
            X = X.dropna()  # ... then drop every row that contains a NaN
            # reload the trained model for use:
            with open('trained_rf.pkl', 'rb') as m:
                spd_filter = pickle.load(m)
            PredictedClass = spd_filter.predict(X)  # predict spam or automated accounts/tweets
            X['PredictedClass'] = PredictedClass  # include the predicted class in the DataFrame
            nonspam = df0.loc[X.index[X.PredictedClass == 1]]  # non-spam accounts, aligned on the UserID index
            spam = df0.loc[X.index[X.PredictedClass == 0]]  # spam/automated accounts
            relevant_tweets = nonspam[['CreatedAt', 'AccountCreated', 'ScreenName', 'Location', 'TimeZone',
                                       'Description', 'VerifiedAccount', 'RawTweets', 'CleanTweets',
                                       'TweetFavouriteCount', 'Retweets']]
            relevant_tweets = relevant_tweets.reset_index()  # reset the index, keeping UserID as a column
            # save files:
            X.to_csv(data_source[:5] + '_all_predicted_classes.csv')
            nonspam.to_csv(data_source[:5] + '_nonspam_accounts.csv')
            spam.to_csv(data_source[:5] + '_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5] + '_relevant_tweets.csv')  # relevant tweets for subsequent analysis
        return relevant_tweets  # or return relevant_tweets, nonspam, spam


if __name__ == '__main__':
    main()
```
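If it helps, an empty matrix at `predict()` usually means every row was filtered out earlier: either the language check / bare `except` in `extractor()` discarded all lines, or the `dropna()` in `detector()` removed every row. A rough debugging sketch (reusing the names from the script above, assuming `phone.json` is present) to see where the rows disappear:

```python
# Trace how many rows survive each filtering stage.
spd = Spd(['phone.json'])
df0 = spd.extractor('phone.json')
print('rows extracted:', len(df0))  # 0 here -> the language filter or the bare except skipped every line

X = df0.drop(['Language', 'TweetID', 'RawTweets', 'CleanTweets', 'CreatedAt', 'AccountCreated',
              'ScreenName', 'Retweets', 'UserName', 'Location', 'Description', 'UserURL',
              'VerifiedAccount', 'RetweetCount', 'TimeZone', 'TweetFavouriteCount'], axis=1)
X = X.replace([np.inf, -np.inf], np.nan)
print('rows after dropna:', len(X.dropna()))  # 0 here -> every row contains a NaN or inf feature
```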