forked from edyoda/python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathml-solution.py
More file actions
32 lines (24 loc) · 834 Bytes
/
ml-solution.py
File metadata and controls
32 lines (24 loc) · 834 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
def check(d):
    """Convert one raw field to a float, treating "?" as a missing value.

    Args:
        d: A field value; the string "?" marks a missing value.

    Returns:
        0.0 when ``d`` equals "?", otherwise ``float(d)``.

    Raises:
        ValueError: If ``d`` is a string other than "?" that ``float``
            cannot parse.
    """
    if d == "?":
        return 0.0
    return float(d)
def process(row):
    """Build a LabeledPoint from one already-split record.

    Args:
        row: Sequence of string fields for one record. The last field is
            parsed as the integer label; the fields from index 4 up to, but
            not including, the last one are converted to numeric features
            (a "?" field becomes 0.0).

    Returns:
        A LabeledPoint whose label is ``int(row[-1])`` and whose features are
        a dense vector of the converted feature fields.

    Raises:
        ValueError: If the last field is not an integer literal or a feature
            field cannot be converted to a float.
    """
    label = int(row[-1])
    # Stop before the final column: it holds the label, and keeping it in the
    # feature vector would hand the answer to the model.
    # Build a real list (not a lazy ``map`` object) so Vectors.dense also
    # works on Python 3.
    features = [check(field) for field in row[4:-1]]
    return LabeledPoint(label, Vectors.dense(features))
# Driver: load train.tsv, train a logistic regression model on it and print
# the model's prediction for the first record.
sc = SparkContext("local[2]", "First Spark App")
try:
    # Strip every double quote from each line, then split it on tabs.
    # NOTE(review): a header line would make process() fail on int(); confirm
    # train.tsv has had its header removed.
    rdd = sc.textFile("train.tsv")
    rdd = rdd.map(lambda line: line.replace("\"", ""))
    rdd = rdd.map(lambda line: line.split("\t"))
    res = rdd.map(process)

    numIterations = 10
    maxTreeDepth = 5  # not referenced by anything else in this script
    lrModel = LogisticRegressionWithSGD.train(res, numIterations)

    datapoint = res.first()
    # Call form of print behaves the same on Python 2 and Python 3 for a
    # single argument.
    print(lrModel.predict(datapoint.features))
finally:
    # Release the SparkContext even when loading or training fails.
    sc.stop()