
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required. #912


Closed
balalaunicorn opened this issue Jul 5, 2022 · 2 comments


balalaunicorn commented Jul 5, 2022

I'm new to programming and machine learning, and I'm using code I found in a journal for spam detection. When I run it, it fails with an error even though I prepared the data correctly. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.'
Can anyone please help me with this issue?
The complete code is here: https://github.com/ijdutse/spd

#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft

data_sources = ['phone.json']

def main():
    spd = Spd(data_sources) # class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    print('Elapsed: {:.2f}s'.format(stop - start)) # report the processing time
    return relevant_tweets




class Spd:
    """ Some functions to accept raw files, extract relevant fields and filter out irrelevant content."""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        
    # first function in the class:
    def extractor(self, data_source): # accept one file of raw tweets in the form of JSON objects
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],\
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],\
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],\
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open(data_source, 'r') as f: # data_source is passed in from detector()
            for line in f.readlines():
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])                       
                        data_extracts['ScreenName'].append(line['ScreenName'])                          
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets +=1
                except (KeyError, ValueError): # skip malformed records and missing fields
                    continue
            df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
            df0['CreatedAt']=pd.to_datetime(data_extracts['CreatedAt'],errors='coerce') # convert to datetime
            df0['AccountCreated']=pd.to_datetime(data_extracts['AccountCreated'],errors='coerce')
            df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
            # compute the account age (in days) directly from the datetime columns
            df0['AccountAge'] = (df0.CreatedAt - df0.AccountCreated).dt.days
            # add/define additional features ...
            df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[:1]==['RT']) # guard against empty tweets
            df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
            df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
            df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
            df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
            df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
            df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
            df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
            df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
            df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
            df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
            df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
            df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
            df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
            df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)       
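            # Note: the ratio features above divide by counts that can be zero,
            # yielding inf (or NaN for 0/0); detector() later maps these to NaN
            # and drops the affected rows, which can empty the frame entirely.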
            # Remove all RTs, set UserID as index and save relevant files:
            df0 = df0[~df0.Retweets] # remove retweets
            df0 = df0.set_index('UserID')
            df0 = df0[~df0.index.duplicated()] # drop duplicate users
            df0.to_csv(data_source[:5]+'all_extracts.csv') # save all extracts as csv
            with open(data_source[:5]+'non_English.txt','w') as d: # save count of non-English tweets
                d.write('{}'.format(non_english_tweets))
        return df0

    
    def detector(self, data_sources): # accept a list of files of raw tweets as JSON objects
        self.data_sources = data_sources
        for data_source in data_sources:
            df0 = self.extractor(data_source)
            # drop fields not required for prediction
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',\
                 'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinite values (from zero division) with NaN ...
            X = X.dropna() # ... and drop every row that contains one
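            # If every row still contained a NaN or inf at this point (e.g. an
            # AccountAge of 0 makes Activeness infinite, and a FriendsCount or
            # FollowersCount of 0 does the same to the ratio features), dropna()
            # leaves X empty and predict() below fails with "ValueError: Found
            # array with 0 sample(s)". A quick sanity check:
            print('rows before/after dropna:', len(df0), len(X))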
            # reload the trained model for use:
            with open('trained_rf.pkl','rb') as m:
                spd_filter = pickle.load(m)
            PredictedClass = spd_filter.predict(X) # predict spam or automated accounts/tweets
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            df0 = df0.loc[X.index] # keep df0 aligned with the rows that survived dropna()
            nonspam = df0[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0[X.PredictedClass.values==0] # sort out spam/automated accounts
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') # save all predicted classes as csv
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
        return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ =='__main__':
    main()
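
One way to narrow an error like this down is to count how many rows survive each filtering step before predict() is called. A minimal sketch, assuming the phoneall_extracts.csv written by extractor() above (the column selection here is simplified with select_dtypes rather than the explicit drop list):

import numpy as np
import pandas as pd

# Sketch: trace where the rows disappear in detector().
df0 = pd.read_csv('phoneall_extracts.csv', index_col='UserID') # extractor() output
print('rows extracted:', len(df0))

X = df0.select_dtypes(include=[np.number]) # keep only the numeric features (simplification)
X = X.replace([np.inf, -np.inf], np.nan)
print('rows containing NaN/inf:', int(X.isna().any(axis=1).sum()))

X = X.dropna()
print('rows left for predict():', len(X)) # 0 here reproduces the ValueError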
@glemaitre (Member) commented:

Please provide a minimal example.

@glemaitre (Member) commented:

You can have a look at https://scikit-learn.org/dev/developers/minimal_reproducer.html to craft one.
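
For reference, a reproducer for this particular error can be as small as fitting on dummy data and predicting on an empty array (a sketch, not tied to the script above):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Fit on 10 dummy samples with the same 19 features, then predict on 0 rows:
clf = RandomForestClassifier(n_estimators=10).fit(np.random.rand(10, 19), np.random.randint(0, 2, 10))
clf.predict(np.empty((0, 19)))
# ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.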
