AI/MLOps

Kubernetes)Mlflow설치

MightyTedKim 2022. 2. 21. 14:50
728x90
반응형

 

mlflow는 모델을 저장하고 비교하는 플랫폼

서빙까지는 좀 무리가 있는것 같아요

airflow에서 mlflow로 던지는 방법으로 사용하려고 해요

결과

# $k get all -n mlflow
NAME                                     READY   STATUS    RESTARTS      AGE
pod/mlflow-deployment-75686677bf-k4bvx   1/1     Running   4 (51d ago)   59d
pod/mlflow-postgres-0                    1/1     Running   1 (51d ago)   59d

NAME                              TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
service/mlflow-postgres-service   NodePort   10.98.250.201   <none>        5432:30112/TCP   59d
service/mlflow-service            NodePort   10.104.33.19    <none>        5000:30013/TCP   59d

NAME                                READY   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/mlflow-deployment   1/1     1            1           59d

NAME                                           DESIRED   CURRENT   READY   AGE
replicaset.apps/mlflow-deployment-75686677bf   1         1         1       59d

NAME                               READY   AGE
statefulset.apps/mlflow-postgres   1/1     59d

$k get pvc -n mlflow
NAME                           STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS      AGE
mlflow-pvc-mlflow-postgres-0   Bound    pvc-7182d621-5a80-46ad-9662-d1a33c57121f   100Mi      RWO            rook-ceph-block   59d
 

설치

mlflow-pgsql.yaml

mlflow-pvc.yaml
mlflow.yaml

# $ cat mlflow-pgsql.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: mlflow-postgres-config
  namespace: mlflow
  labels:
    app: mlflow-postgres
data:
  POSTGRES_DB: mlflow_db
  POSTGRES_USER: mlflow_user
  POSTGRES_PASSWORD: mlflow_pwd
  PGDATA: /var/lib/postgresql/mlflow/data
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: mlflow-postgres
  namespace: mlflow
  labels:
    app: mlflow-postgres
spec:
  selector:
    matchLabels:
      app: mlflow-postgres
  serviceName: "mlflow-postgres-service" 
  replicas: 1
  template:
    metadata:
      labels:
        app: mlflow-postgres
    spec:
      containers:
      - name: mlflow-postgres
        image: hgkim/library/postgres:11
        ports:
        - containerPort: 5432
          protocol: TCP
        envFrom:
        - configMapRef:
            name: mlflow-postgres-config
        resources:
          requests:
            memory: "1Gi" 
            cpu: "500m" 
        volumeMounts:
        - name: mlflow-pvc
          mountPath: /var/lib/postgresql/mlflow
  volumeClaimTemplates:
  - metadata:
      name: mlflow-pvc
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 100Mi
---
apiVersion: v1
kind: Service
metadata:
  name: mlflow-postgres-service
  namespace: mlflow
  labels:
    svc: mlflow-postgres-service
spec:
  type: NodePort
  ports:
   - nodePort: 30112
     port: 5432
     protocol: TCP
     targetPort: 5432
  selector:
    app: mlflow-postgres

$ cat mlflow-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name:  mlflow-pvc-mlflow-postgres-0
  namespace: mlflow
  labels:
    app: mlflow-postgres
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
  storageClassName: rook-ceph-block

 

 

실행

- 설정

# $ cat requirements.txt
toml
pyarrow
s3fs==2021.11.1
boto3
mlflow
sklearn

$ cat conf/configs.toml
[app.mlflow]

object_storage_endpoint = 'http://10.***.35.32:30071'
object_storage_bucket   = "***" 
object_storage_key      = "model/1.raw_data/train.csv" 
mlflow_tracking_url     = "http://10.***.35.32:30013" 

#mlflow
alpha = 0.1
l1_ratio = 0.1
artifact_path = '***-model-wine'
registered_model_name = '***-model-wine'

 

- jupyterhub나 airflow에서 실행

# $ cat main.py
# Impor/ting in necessary libraries
import os
import numpy as np
import pandas as pd

import logging.config
import toml

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import ElasticNet

import mlflow
import mlflow.sklearn

import io
import pyarrow.parquet as pq
import s3fs
import argparse

configs = toml.load("./conf/configs.toml")
app_configs = configs["app"]
print(app_configs)

class MlflowOperations:
    def __init__(self, conf):
        print("==init==")
        self.conf = conf
        self.alpha = self.conf["alpha"]
        self.l1_ratio = self.conf["l1_ratio"]

        #for Logging model to MLFlo
        os.environ['MLFLOW_S3_ENDPOINT_URL'] = self.conf["object_storage_endpoint"]
        os.environ['AWS_ACCESS_KEY_ID'] = 'access-key'
        os.environ['AWS_SECRET_ACCESS_KEY'] = 'secret-key'

    def open_mlflow_session(self):
        print("==open_mlflow_session==")
        mlflow.set_tracking_uri(self.conf["mlflow_tracking_url"])
        mlflow.set_experiment("***-wine-test")
        return mlflow

    def load(self):
        print("==load==")
        object_storage_fs = s3fs.S3FileSystem(
            anon=False,
            use_ssl=False,
            client_kwargs={
                "region_name": "",
                "endpoint_url": self.conf['object_storage_endpoint'], #'http://10.233.21.208:80',
                "aws_access_key_id": 'access-key',
                "aws_secret_access_key": 'secret-key',
                "verify": False,
            }
        )
        return pd.read_csv(object_storage_fs.open('{}/{}'.format( self.conf['object_storage_bucket'], self.conf['object_storage_key']),mode='rb'))

    def split_data(self):
        print("==split_data==")
        # Loading data from a CSV file
        df_wine = self.load()
        print(df_wine)

        # # Separating the target class ('quality') from remainder of the training data
        X = df_wine.drop(columns = ['quality','kind'])
        y = df_wine[['quality']]
        # Splitting the data into training and validation sets
        X_train, self.X_val, y_train, self.y_val = train_test_split(X, y, random_state = 30)
        self.X_train = np.float32(X_train)
        self.y_train = np.float32(y_train)

        self.X_train = np.nan_to_num(X_train, nan=-9999, posinf=33333333, neginf=33333333)
        self.y_train = np.nan_to_num(y_train, nan=-9999, posinf=33333333, neginf=33333333)

    def train_data(self):
        mlfow = self.open_mlflow_session()
        # Running MLFlow script
        with mlflow.start_run():
            # Instantiating model with model parameters
            model = ElasticNet(alpha = self.alpha, l1_ratio = self.l1_ratio)
            # Fitting training data to the model
            model.fit(self.X_train, self.y_train)
            # Running prediction on validation dataset
            preds = model.predict(self.X_val)
            # Getting metrics on the validation dataset
            rmse = mean_squared_error(preds, self.y_val)
            abs_error = mean_absolute_error(preds, self.y_val)
            r2 = r2_score(preds, self.y_val)

        # Logging params and metrics to MLFlow
            mlflow.log_param('alpha', self.alpha)
            mlflow.log_param('l1_ratio', self.l1_ratio)
            mlflow.log_metric('rmse', rmse)
            mlflow.log_metric('abs_error', abs_error)
            mlflow.log_metric('r2', r2)
        # Logging training data
            #mlflow.log_artifact(local_path = './train.csv')
        # Logging training code
            #mlflow.log_artifact(local_path = './main.py')

        # Logging model to MLFlow
            mlflow.sklearn.log_model(sk_model = model,
                                     artifact_path = self.conf["artifact_path"],
                                     registered_model_name = self.conf["registered_model_name"] )

def main(conf):
    print("==main==")
    # argparse
    parser = argparse.ArgumentParser(description='Argparse')
    parser.add_argument('--alpha',    type=float,   default=0.1)
    parser.add_argument('--l1ratio',  type=float,   default=0.1)
    args    = parser.parse_args()

    # argument check
    if args.alpha != 0.1:
        conf["mlflow"]["alpha"] = args.alpha
    if args.l1ratio != 0.1:
        conf["mlflow"]["l1_ratio"] = args.l1ratio

    print("------")
    print("alpha: " + str(conf["mlflow"]["alpha"]))
    print("l1ratio: " + str(conf["mlflow"]["l1_ratio"]))
    print("------")

    session = MlflowOperations(conf["mlflow"])
    session.split_data()
    session.train_data()

if __name__ == "__main__":
    main(configs["app"])

 

728x90
반응형