首页 > 解决方案 > 使用 Pyspark 训练随机森林回归模型

问题描述

我正在做一个情绪分析项目,使用的是从 StockTwits 提取的 JSON 格式数据。每条推文都带有一个情绪分数,该分数是一个介于 0 和 1 之间的浮点数。我想使用 PySpark MLlib 训练随机森林模型。

下面是我的代码:

我将 Spark DataFrame 转换为 RDD,然后尝试训练 RandomForest 模型:

import nltk
import time
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

import csv
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize,blankline_tokenize
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import re
import string
from collections import Counter
import json
import re as regex
import xgboost as xgb
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import numpy as np
import findspark
findspark.init()
#findspark.init("C:\opt\spark\spark-2.3.0-bin-hadoop2.7")
import pyspark.sql.types as typ
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running on the local master.
sparkSession = (
    SparkSession.builder
    .master("local")
    .appName("Spark ML")
    .getOrCreate()
)
# Load the JSON file with pandas first; the resulting pandas DataFrame is
# turned into a Spark DataFrame later in the script.
df = pd.read_json("Microblog_Trialdata.json")
def list_sp(row):
    """Collapse the list stored under ``row["spans"]`` into one string.

    The items are joined with single spaces and the result is written back
    to ``row["spans"]``.

    Args:
        row: A mapping-like object (e.g. a pandas row) whose ``"spans"``
            entry is an iterable of strings.

    Returns:
        The same ``row`` object, modified in place, so the function can be
        passed to ``DataFrame.apply(..., axis=1)``.
    """
    row["spans"] = ' '.join(row["spans"])
    return row
# Flatten every row's "spans" list into a single string.
df = df.apply(list_sp, axis=1)
# Hand the pandas frame over to Spark.
train_data = sparkSession.createDataFrame(df)
df.head()
import pyspark.mllib.regression
from pyspark.mllib.regression import LabeledPoint
train_data.show()
# Extract the sentiment score and the tweet text, renaming the
# "sentiment score" column to "label".
spans = train_data.select("sentiment score", "spans").toDF("label", "spans")
# Tokenizer reading the "spans" column and writing the "tokens" column.
# With RegexTokenizer's default gaps=True the pattern describes the
# separators: runs of whitespace, or any one of the characters $ , . "
import pyspark.ml.feature as ft
tokenizer = ft.RegexTokenizer(
    inputCol='spans',
    outputCol='tokens',
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence. The pattern value itself is unchanged.
    pattern=r'\s+|[$,."]',
)
# Tokenize the text.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
# UDF returning the number of tokens in a row; it is defined here but not
# applied in this part of the script.
countTokens = udf(lambda words: len(words), IntegerType())
# Run the tokenizer on the (label, spans) DataFrame. The original passed an
# undefined name `new`; `spans` is the frame that holds the tokenizer's
# input column and is also what the pipeline is fitted on.
tok = tokenizer.transform(spans)
# Remove stop words from each row's token list ("tokens" -> "filtered").
from pyspark.ml.feature import StopWordsRemover

remover=StopWordsRemover(inputCol="tokens", outputCol="filtered")
filtered_df=remover.transform(tok)
# Build n-grams from the filtered tokens ("filtered" -> "n-gram").
# NOTE(review): the original comment said "2-gram", but n=1 is what is
# passed below — confirm which one is intended.
from pyspark.ml.feature import NGram

ngram=NGram(n=1, inputCol="filtered", outputCol="n-gram")
gram_df=ngram.transform(filtered_df)
# Turn the n-gram lists into term-count vectors.
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer

# Vectorizer reading "n-gram" and writing "features", with the vocabulary
# size and minimum document frequency given below.
cv = CountVectorizer(
    inputCol="n-gram",
    outputCol="features",
    vocabSize=20,
    minDF=2.0,
)

# Learn the vocabulary from the corpus, then vectorize that same corpus.
model = cv.fit(gram_df)
result = model.transform(gram_df)
# Assemble the complete pipeline from the stages defined above, fit it on
# the (label, spans) DataFrame and apply it to that same DataFrame.
from pyspark.ml import Pipeline
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        ngram,
        cv,
    ]
)
pipelineFit = pipeline.fit(spans)
final_df = pipelineFit.transform(spans)
# Train a random-forest regressor with the RDD-based MLlib API.
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

# RandomForest.trainRegressor expects an RDD of LabeledPoint, not a
# DataFrame. The pipeline output holds pyspark.ml vectors in "features",
# so each one is converted to its pyspark.mllib counterpart. The original
# passed an undefined name `train_df` here.
train_rdd = final_df.select("label", "features").rdd.map(
    lambda row: LabeledPoint(row["label"], MLlibVectors.fromML(row["features"]))
)

model = RandomForest.trainRegressor(
    train_rdd,
    categoricalFeaturesInfo={},  # empty: all features treated as continuous
    numTrees=10,
    # maxDepth must be an integer (the original passed None); 4 is the
    # library's documented default.
    maxDepth=4,
    maxBins=32,
    seed=42,
)

标签: apache-spark pyspark apache-spark-mllib random-forest

解决方案


如果您的问题是如何将 DataFrame 转换为 RDD,那么只需像下面这样使用 .rdd 属性:

    newRDD = yourdataframe.rdd 

推荐阅读