当我在做这些事情时(直到此刻没有任何障碍)并尝试运行更多测试以获得更多结果,我得到了 MemoryError。

MemoryError                               Traceback (most recent call last)
<ipython-input-66-1ab060bc6067> in <module>
     22 for g in range(0,10000):
     23         # sample
---> 24         sample_df = stratified_sample(df,test,size=38, keep_index=False)
     25         pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
     26         example = "exampleFCUL"

<ipython-input-10-7aba847839db> in stratified_sample(df, strata, size, seed, keep_index)
     79         # final dataframe
     80         if first:
---> 81             stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
     82             first = False
     83         else:

D:\Anaconda\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
   3182         kwargs["level"] = kwargs.pop("level", 0) + 1
   3183         kwargs["target"] = None
-> 3184         res = self.eval(expr, **kwargs)
   3186         try:

D:\Anaconda\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
   3298             kwargs["target"] = self
   3299         kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
-> 3300         return _eval(expr, inplace=inplace, **kwargs)
   3302     def select_dtypes(self, include=None, exclude=None):

D:\Anaconda\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
    325         eng = _engines[engine]
    326         eng_inst = eng(parsed_expr)
--> 327         ret = eng_inst.evaluate()
    329         if parsed_expr.assigner is None:

D:\Anaconda\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
     69         # make sure no names in resolvers and locals/globals clash
---> 70         res = self._evaluate()
     71         return _reconstruct_object(
     72             self.result_type, res, self.aligned_axes, self.expr.terms.return_type

D:\Anaconda\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
    117             truediv = scope["truediv"]
    118             _check_ne_builtin_clash(self.expr)
--> 119             return ne.evaluate(s, local_dict=scope, truediv=truediv)
    120         except KeyError as e:
    121             # python 3 compat kludge

D:\Anaconda\lib\site-packages\numexpr\necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
    814     expr_key = (ex, tuple(sorted(context.items())))
    815     if expr_key not in _names_cache:
--> 816         _names_cache[expr_key] = getExprNames(ex, context)
    817     names, ex_uses_vml = _names_cache[expr_key]
    818     arguments = getArguments(names, local_dict, global_dict)

D:\Anaconda\lib\site-packages\numexpr\necompiler.py in getExprNames(text, context)
    706 def getExprNames(text, context):
--> 707     ex = stringToExpression(text, {}, context)
    708     ast = expressionToAST(ex)
    709     input_order = getInputOrder(ast, None)

D:\Anaconda\lib\site-packages\numexpr\necompiler.py in stringToExpression(s, types, context)
    282         else:
    283             flags = 0
--> 284         c = compile(s, '<expr>', 'eval', flags)
    285         # make VariableNode's for the names
    286         names = {}




def transform(multilevelDict):
    return {"t"+'_'+str(key) : (transform(value) if isinstance(value, dict) else value) for key, value in multilevelDict.items()}
df = pd.read_csv('testingwebsitedata6.csv', sep=';')
df['Element_Count'] =  df['Element_Count'].apply((json.loads))
df['Tag_Count'] =  df['Tag_Count'].apply((json.loads))
for i in range(len(df['Tag_Count'])):
    df['Tag_Count'][i] = transform(df['Tag_Count'][i])
df1 = pd.DataFrame(df['Element_Count'].values.tolist())
df2 = pd.DataFrame(df['Tag_Count'].values.tolist())
df = pd.concat([df.drop('Element_Count', axis=1), df1], axis=1)
df= pd.concat([df.drop('Tag_Count', axis=1), df2], axis=1)
df= df.fillna(0)
df[df.select_dtypes(include=['float64']).columns]= df.select_dtypes(include=['float64']).astype(int)

test= ['link', 'document', 'heading', 'form', 'textbox', 'button', 'list', 'listitem', 'img', 'navigation', 'banner', 'main', 'article', 'contentinfo', 'checkbox', 'table', 'rowgroup', 'row', 'cell', 'listbox', 'presentation', 'figure', 'columnheader', 'separator', 'group', 'region', 't_html', 't_head', 't_title', 't_meta', 't_link', 't_script', 't_style', 't_body', 't_a', 't_div', 't_h1', 't_form', 't_label', 't_input', 't_ul', 't_li', 't_i', 't_img', 't_nav', 't_header', 't_span', 't_article', 't_p', 't_footer', 't_h3', 't_br', 't_noscript', 't_em', 't_strong', 't_button', 't_h2', 't_ol', 't_time', 't_center', 't_table', 't_tbody', 't_tr', 't_td', 't_font', 't_select', 't_option', 't_b', 't_figure', 't_figcaption', 't_u', 't_iframe', 't_caption', 't_thead', 't_th', 't_h5', 't_sup', 't_map', 't_area', 't_hr', 't_h4', 't_blockquote', 't_sub', 't_fieldset', 't_legend', 't_pre', 't_main', 't_section', 't_small', 't_tfoot', 't_textarea', 't_inserir', 't_s']

for g in range(0,10000):
        # sample
        sample_df = stratified_sample(df,test,size=38, keep_index=False)
        pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
        example = "exampleFCUL"
        randomnumber = g+1
        csv = ".csv"
        path = pathaux + '26'+'//'+ example +str(randomnumber) + csv
        chosencolumns= ["Uri"]
        sample_df.to_csv(path,sep=';', index = False, columns =chosencolumns, header = False)   


def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    It samples data from a pandas dataframe using strata. These functions use
    proportionate stratification:
    n1 = (N1/N) * n
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1
        - N is the total population size
        - n is the sampling size
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed
    :keep_index: if True, it keeps a column with the original population index indicator
    A sampled pandas dataframe based in a set of strata.
    >> df.head()
        id  sex age city 
    0   123 M   20  XYZ
    1   456 M   25  XYZ
    2   789 M   21  YZX
    3   987 F   40  ZXY
    4   654 M   45  ZXY
    # This returns a sample stratified by sex and city containing 30% of the size of
    # the original data
    >> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)
    - pandas
    - numpy
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True 
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']

            if type(value) == str:
                value = "'" + str(value) + "'"
            if s != len(strata)-1:
                qry = qry + stratum + ' == ' + str(value) +' & '
                qry = qry + stratum + ' == ' + str(value)
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    return stratified_df

def stratified_sample_report(df, strata, size=None):
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
            - cochran_n = result of the previous formula
            - N is the population size
    A dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
    return tmp_grpd

def __smpl_size(population, size):
    A function to compute the sample size. If not informed, a sampling 
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
            - cochran_n = result of the previous formula
            - N is the population size

        :population: population size
        :size: sample size (default = None)
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n 


标签: python-3.xpandasdataframe

