Azure Machine Learning Studio - Notebooks

Import all the required libraries and set up the plot formatting.

import pandas as pd
import numpy as np
import itertools
import warnings
import statsmodels.api as sm
import matplotlib
import matplotlib.pylab as pylab
from matplotlib.pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 20, 6
warnings.filterwarnings("ignore")

Load Workspace

from azureml.core import Workspace

# Option 1: look the workspace up explicitly
ws = Workspace.get(name="myworkspace", subscription_id='<subscription_id>', resource_group='myresourcegroup')

# Option 2: load it from a local config.json
ws = Workspace.from_config()
ws.get_details()
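from_config() looks for a config.json (in the current directory, a parent directory, or an .azureml/ folder); if you don't have one yet, write_config() on a workspace object creates it. A minimal sketch:

# One-time step: save the workspace details to .azureml/config.json
# so later sessions can simply call Workspace.from_config().
ws.write_config()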

Load data from the Workspace

ds = ws.datasets['data_name'] # a dataset registered in the workspace
df = ds.to_pandas_dataframe()

Read a CSV file

df_raw = pd.read_csv(filepath_or_buffer = './test.csv', header = 0)

Upload data to container

default_store = ws.get_default_datastore() 
greenTaxiData='csvdata/green/part-00000'

default_store.upload_files(
    [greenTaxiData], 
    target_path = 'green', 
    overwrite = True, 
    show_progress = True
)
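To make the uploaded file available through ws.datasets as shown earlier, it can be registered as a tabular dataset. A sketch, assuming the 'green' target path from the upload above and reusing the dataset name 'data_name':

from azureml.core import Dataset

# Build a tabular dataset from the uploaded file on the default datastore
green_ds = Dataset.Tabular.from_delimited_files(path=(default_store, 'green/part-00000'))

# Register it so ws.datasets['data_name'] resolves to it
green_ds = green_ds.register(workspace=ws, name='data_name', create_new_version=True)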

Show data

df.head(5) # show the first 5 rows
df.dtypes # column names and types
df.describe() # summary statistics: count, mean, std, min, max
combined_df.transpose() # transpose rows and columns

df['Column'].min() # min
df['Column'].max() # max

Data cleaning

# Drop rows where all values are null, or where specific columns are null
df1 = df_raw.dropna(axis=0, how="all")
latlong_filtered_df1 = combined_df.dropna(subset=["field1", "field2"], axis=0, how="any")

# Rename columns
green_df2 = green_df1.rename(
    columns={
        "cost": "field1", "distance": "field2",
    })

# Keep only the useful columns
useful_columns = ["cost", "distance"]
green_df = green_df_raw[useful_columns]
# equivalent, but tolerant of missing columns:
green_df = green_df_raw[green_df_raw.columns[green_df_raw.columns.isin(useful_columns)]]

# Union (row-wise concatenation of) two tables
combined_df = pd.concat([green_df, yellow_df], axis=0)

# Find columns that contain null values
combined_df.isnull().any()
print(filtered_df.isnull().sum())

# Filter rows
filtered_df = df1[
    (df1['pickup_longitude'] <= -73.72)
    & (df1['pickup_longitude'] >= -74.09)
]

# Replace and fill values
filtered_df['store_forward'] = filtered_df['store_forward'].replace([0], 'N')
filtered_df['store_forward'] = filtered_df['store_forward'].fillna('N')

# Drop columns
df = df.drop(['pickup_second', 'dropoff_second'], axis=1)

Data type conversion

# Convert to numeric
df['distance'] = df['distance'].replace(".00", '0').apply(pd.to_numeric, errors='coerce')

# Convert to datetime
df['Month'] = pd.to_datetime(df['Month'])
df["pickup_datetime"] = df["pickup_datetime"].apply(pd.to_datetime)

# Split date and time, e.g. 8/12/2013 10:06
df['pickupday'] = [d.date() for d in df['pickup_datetime']]
df['pickuptime'] = [d.time() for d in df['pickup_datetime']]

df['pickup_weekday'] = df['pickup_datetime'].dt.dayofweek
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute
df['pickup_second'] = df['pickup_datetime'].dt.second

Export to CSV

final_df.to_csv("final_df.csv")

Split data

features_df = final_df[['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor']] # features
label_df = final_df['cost'] # label

from sklearn.model_selection import train_test_split

feature_train, feature_test, label_train, label_test = train_test_split(features_df, label_df, test_size=0.2, random_state=223)
# label_train.values.flatten() gives the 1-D label array AutoML expects (used again below)
label_train.values.flatten()

Auto-train (automated machine learning)

Use automated machine learning to find the best run and the best fitted model.

import logging

automl_settings = {
    "iteration_timeout_minutes": 10,
    "iterations": 30,
    "primary_metric": 'spearman_correlation',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5  # fold count assumed; the original left this blank
}

from azureml.train.automl import AutoMLConfig

# local compute
automated_ml_config = AutoMLConfig(
    task = 'regression',
    debug_log = 'automated_ml_errors.log',
    blocked_models =['XGBoostRegressor'],
    path = '.',
    X = feature_train.values,
    y = label_train.values.flatten(),
    **automl_settings
)

# Run the experiment locally
from azureml.core.experiment import Experiment

experiment=Experiment(ws, 'myAutoMLRegressionExp2')
local_run = experiment.submit(automated_ml_config, show_output=True)

Compare the results of different algorithms

from azureml.widgets import RunDetails
RunDetails(local_run).show()

# Option 1: view the run details in the studio under Job -> Models
# Option 2: read the results from the local run variable
children = list(local_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

rundata = pd.DataFrame(metricslist).sort_index(axis=1) # sort columns by iteration index
rundata

# Pick the best-performing model
best_run, fitted_model = local_run.get_output()
print(best_run) # print run ID, type and status
print(fitted_model) # print the pipeline
# model.pkl is generated automatically in the root directory
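The winning run's logged metrics can also be read directly; spearman_correlation is the primary metric configured in automl_settings above:

# Pull all metrics logged for the best iteration
best_metrics = best_run.get_metrics()
print(best_metrics.get('spearman_correlation'))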

Train Model

Custom model

from sklearn.linear_model import ElasticNet

regr = ElasticNet(random_state=0)
regr.fit(feature_train, label_train)

model = "ElasticNet intercept %.2f coefs %s" % (regr.intercept_, regr.coef_)

yhat_train = regr.predict(feature_train)
yhat_test = regr.predict(feature_test)

Score Model

label_predict = fitted_model.predict(feature_test.values)

Evaluate Model

from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(label_test, label_predict))
rmse
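RMSE alone can be hard to interpret; for context, scikit-learn's R² and MAE are computed the same way (a sketch using the same test labels and predictions):

from sklearn.metrics import mean_absolute_error, r2_score

print("R^2:", r2_score(label_test, label_predict))  # 1.0 is a perfect fit
print("MAE:", mean_absolute_error(label_test, label_predict))  # same units as 'cost'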

Save Model

import joblib

joblib.dump(fitted_model, 'automl_model.pkl')
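Loading the pickle back restores the full fitted pipeline; a quick round-trip check:

# Reload the saved model and confirm it still predicts
loaded_model = joblib.load('automl_model.pkl')
print(loaded_model.predict(feature_test.values)[:5])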

Deploy Model

from azureml.core.model import Model

# register a model
model = Model.register(
    model_path = "nyc_taxifare_elasticnet_model.pkl",
    model_name = "nyc_taxifare_elasticnet_mdl",
    tags = {'area' : "cost", 'type' : "regression"},
    description = "Regression model",
    workspace = ws
)

# List models registered in the workspace
model_list = Model.list(workspace=ws)
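Each registered model carries a name and an auto-incremented version; a quick way to see what is in the workspace:

# Print every registered model with its version
for m in model_list:
    print(m.name, m.version)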

Scoring File

Create score.py with the %%writefile cell magic (the magic line takes only the filename):

%%writefile score.py

...
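The entry script must define init() (runs once, loads the model) and run() (runs per request). A minimal sketch, assuming the registered model name 'nyc_taxifare_elasticnet_mdl' from the deployment step and the JSON payload shape used in the test below:

import json
import numpy as np
import joblib
from azureml.core.model import Model

def init():
    # Runs once when the service container starts: load the registered model.
    global model
    model_path = Model.get_model_path('nyc_taxifare_elasticnet_mdl')
    model = joblib.load(model_path)

def run(raw_data):
    # Runs per request: parse {'data': [[...]]} and return predictions as a list.
    data = np.array(json.loads(raw_data)['data'])
    return model.predict(data).tolist()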

Environment configuration (yml) file

from azureml.core.conda_dependencies import CondaDependencies

MyModelEnv = CondaDependencies.create(conda_packages=['scikit-learn'])
with open("nyc_taxifare_model_env.yml", "w") as f:
    f.write(MyModelEnv.serialize_to_string())
aciconfig = AciWebservice.deploy_configuration(...)
inference_config = InferenceConfig(...)
service = model.deploy(...)
service.wait_for_deployment(...)
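Filled in, those four elided calls might look like the following; the compute sizes, environment name, and service name are illustrative assumptions:

from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice

# Small ACI container: 1 CPU core, 1 GB RAM (assumed sufficient here)
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    tags={'area': "cost", 'type': "regression"},
    description="NYC taxi fare regression service"
)

# Reuse the conda file written above plus the scoring script
env = Environment.from_conda_specification(name="nyc-taxifare-env", file_path="nyc_taxifare_model_env.yml")
inference_config = InferenceConfig(entry_script="score.py", environment=env)

service = Model.deploy(ws, "nyc-taxifare-svc", [model], inference_config, aciconfig)
service.wait_for_deployment(show_output=True)
print(service.scoring_uri)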

Test Webservice

import json
import requests

test_input2 = json.dumps({'data': [[2,4,15,3,5]]})

headers = {'Content-Type':'application/json'}
resp = requests.post(service.scoring_uri, test_input2, headers=headers)

prediction = json.loads(resp.text)
print(prediction)
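The same payload can also be sent through the SDK without HTTP, which is convenient for debugging:

# Call the webservice object directly with the JSON string
print(service.run(input_data=test_input2))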
Category: Cloud

Author: Yoga
