---------------------1st Program--------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, matthews_corrcoef,
                             accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, roc_auc_score)

# Load iris dataset and clean species names
df = pd.read_csv('iris.csv')
df['species'] = df['species'].str.strip().str.lower()
df = df[df['species'].isin(['setosa', 'versicolor'])]

# Binary encoding: versicolor = 1, setosa = 0
df['species'] = (df['species'] == 'versicolor').astype(int)
X = df.drop(columns='species')
y = df['species']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Random-guess baseline (seeded for reproducibility)
rng = np.random.default_rng(0)
rand_prob = rng.random(len(y_test))

# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Manual Metrics
acc = (tp + tn) / (tp + tn + fp + fn)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * prec * rec / (prec + rec)
# MCC computed directly from the confusion-matrix counts
mcc = (tp * tn - fp * fn) / np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
spec = tn / (tn + fp)   # specificity = true negative rate
npv = tn / (tn + fn)    # negative predictive value

print("Manual Metrics:")
print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")
print(f"Accuracy={acc:.2f}, Precision={prec:.2f}, Recall={rec:.2f}")
print(f"F1={f1:.2f}, MCC={mcc:.2f}, Specificity={spec:.2f}, NPV={npv:.2f}")

# Sklearn Metrics
print("\nSklearn Metrics:")
print(f"Accuracy={accuracy_score(y_test, y_pred):.2f}")
print(f"Precision={precision_score(y_test, y_pred):.2f}")
print(f"Recall={recall_score(y_test, y_pred):.2f}")
print(f"F1={f1_score(y_test, y_pred):.2f}")
print(f"MCC={matthews_corrcoef(y_test, y_pred):.2f}")

# ROC Curve
fpr1, tpr1, _ = roc_curve(y_test, y_prob)
fpr2, tpr2, _ = roc_curve(y_test, rand_prob)
plt.plot(fpr1, tpr1, label='Model')
plt.plot(fpr2, tpr2, '--', label='Random')
plt.plot([0, 1], [0, 1], 'k--', alpha=0.3)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(); plt.grid(); plt.tight_layout(); plt.show()

# AUC
print(f"\nAUC (Model): {roc_auc_score(y_test, y_prob):.2f}")
print(f"AUC (Random): {roc_auc_score(y_test, rand_prob):.2f}")

---------------------2nd Program--------------------------
# maybe add a heatmap (see the sketch after this program)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# Load dataset
data = load_iris()
X, y = data.data, data.target
df = pd.DataFrame(X, columns=data.feature_names)
print(df.head())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# ID3-style tree: entropy (information gain) as the split criterion
model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)

# Predict a new sample
new_sample = [[5.0, 3.6, 1.4, 0.2]]  # Example from class 0 (Setosa)
prediction = model.predict(new_sample)
predicted_class = data.target_names[prediction[0]]

# Plot tree
plt.figure(figsize=(10, 6))
plot_tree(model, feature_names=data.feature_names, class_names=data.target_names, filled=True)
plt.title("Decision Tree using ID3 (Entropy)")
plt.show()

# Accuracy on test set
accuracy = model.score(X_test, y_test)

# Output
print(f"Predicted class for sample {new_sample}: {predicted_class}")
print(f"Model accuracy on test set: {accuracy:.2f}")
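The "maybe add a heatmap" note in the 2nd Program is easy to act on. The sketch below is an addition, not part of the original program: it renders the decision tree's test-set confusion matrix as an annotated heatmap. It assumes seaborn is installed and reuses model, X_test, y_test, and data from the 2nd Program.

# Sketch (assumed extension): confusion-matrix heatmap for the decision tree
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, model.predict(X_test))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.title("Decision Tree - Confusion Matrix")
plt.show()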
---------------------3rd Program--------------------------
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt

data = load_wine()
X = data.data
y = data.target

trees = [1, 5, 10, 20, 30, 40, 50, 90, 100, 150, 600, 750, 1000]
acc = []
f1 = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

for n in trees:
    model = RandomForestClassifier(n_estimators=n, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1s = f1_score(y_test, y_pred, average='macro')
    print(f"Trees : {n} | accuracy : {accuracy:.4f}, f1 : {f1s:.4f}")
    acc.append(accuracy)
    f1.append(f1s)

plt.figure(figsize=(10, 6))
plt.plot(trees, acc, label='accuracy', color='orange', marker='o')
plt.legend()
plt.xlabel("number of trees")
plt.ylabel("scores")
plt.title("Random Forest: accuracy vs number of trees")
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
plt.plot(trees, f1, label='f1 score', color='red', marker='*')
plt.legend()
plt.xlabel("number of trees")
plt.ylabel("scores")
plt.title("Random Forest: macro F1 vs number of trees")
plt.grid(True)
plt.show()  # was missing, so the second figure never displayed in script mode

---------------------4th Program--------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Create dataset
data = {
    'Hours': [1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0],
    'Scores': [20, 25, 30, 35, 40, 50, 55, 60, 65, 70, 75, 80]
}
df = pd.DataFrame(data)

# Split into features and target
X = df[['Hours']]   # 2D array
y = df['Scores']    # 1D array

# Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("Root Mean Squared Error (RMSE):", round(rmse, 2))
print("R-squared (R²) Score:", round(r2, 2))

# Visualization
plt.scatter(X, y, color='blue', label='Actual Data')
plt.plot(X, model.predict(X), color='red', label='Regression Line')
plt.xlabel('Hours Studied')
plt.ylabel('Score')
plt.title('Simple Linear Regression - Hours vs Scores')
plt.legend()
plt.grid(True)
plt.show()
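For simple linear regression, the line LinearRegression fits is the ordinary least-squares solution: slope = cov(x, y) / var(x) and intercept = mean(y) - slope * mean(x). The sketch below is an optional cross-check, not part of the original program; it reuses X_train, y_train, and model from the 4th Program and compares the closed-form coefficients with sklearn's.

# Sketch (assumed extension): verify the fitted line against the closed-form OLS solution
x_tr = X_train['Hours'].to_numpy()
y_tr = y_train.to_numpy()
slope = np.sum((x_tr - x_tr.mean()) * (y_tr - y_tr.mean())) / np.sum((x_tr - x_tr.mean()) ** 2)
intercept = y_tr.mean() - slope * x_tr.mean()
print(f"Closed-form: slope={slope:.4f}, intercept={intercept:.4f}")
print(f"sklearn:     slope={model.coef_[0]:.4f}, intercept={model.intercept_:.4f}")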
---------------------5th Program--------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, mean_squared_error

# Create dataset
data = {
    'Exam_Score': [35, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90],
    'Admitted':   [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
}
df = pd.DataFrame(data)

# Features and label
X = df[['Exam_Score']]
y = df['Admitted']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# New test data
new_scores = pd.DataFrame({'Exam_Score': [58, 67, 77]})
new_predictions = model.predict(new_scores)

# Evaluation metrics (MSE/RMSE here are computed on the 0/1 labels)
conf_matrix = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Output results
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", round(acc, 2))
print("Precision:", round(prec, 2))
print("Recall:", round(rec, 2))
print("Mean Squared Error:", round(mse, 2))
print("Root Mean Squared Error:", round(rmse, 2))
print("New Test Predictions (58, 67, 77):", new_predictions.tolist())

# Visualization
plt.scatter(X, y, color='blue', label='Actual')
# Keep the feature name so predict_proba sees the same column the model was trained on
x_vals = pd.DataFrame({'Exam_Score': np.linspace(30, 100, 300)})
plt.plot(x_vals, model.predict_proba(x_vals)[:, 1], color='red', label='Logistic Curve')
plt.xlabel('Exam Score')
plt.ylabel('Probability of Admission')
plt.title('Logistic Regression - Admission Prediction')
plt.legend()
plt.grid(True)
plt.show()

---------------------6th Program--------------------------
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Create dataset
data = {
    'Age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '31-40'],
    'Salary': ['Low', 'Medium', 'Low', 'Low', 'High', 'Medium', 'High', 'Medium', 'High', 'Medium', 'Medium'],
    'Purchased': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes']
}
df = pd.DataFrame(data)

# Step 2: Preprocessing (label encoding)
# LabelEncoder assigns codes in sorted (lexicographic) order of the labels.
le_age = LabelEncoder()
le_salary = LabelEncoder()
le_purchase = LabelEncoder()
df['Age'] = le_age.fit_transform(df['Age'])                   # 31-40: 0, <=30: 1, >40: 2
df['Salary'] = le_salary.fit_transform(df['Salary'])          # High: 0, Low: 1, Medium: 2
df['Purchased'] = le_purchase.fit_transform(df['Purchased'])  # No: 0, Yes: 1

# Step 3: Split features and label
X = df[['Age', 'Salary']]
y = df['Purchased']

# Step 4: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Train Naive Bayes Classifier
model = GaussianNB()
model.fit(X_train, y_train)

# Step 6: Predict on test data
y_pred = model.predict(X_test)

# Step 7: New Test Input
new_data = pd.DataFrame({
    'Age': le_age.transform(['<=30', '>40']),
    'Salary': le_salary.transform(['High', 'Low'])
})
new_prediction = model.predict(new_data)
predicted_labels = le_purchase.inverse_transform(new_prediction)

# Step 8: Evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", round(accuracy, 2))
print("New Test Predictions:")
print("['<=30', 'High'] ->", predicted_labels[0])
print("['>40', 'Low'] ->", predicted_labels[1])
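GaussianNB treats the label-encoded integers as continuous Gaussian-distributed values, which is a rough fit for purely categorical features. A minimal alternative sketch, not part of the original program: sklearn's CategoricalNB models each feature as categorical directly. It reuses X_train, y_train, X_test, y_test, new_data, and le_purchase from the 6th Program; min_categories=3 is an assumption reflecting that both features have three levels, guarding against a level missing from the small training split.

# Sketch (assumed alternative): CategoricalNB on the same encoded data
from sklearn.naive_bayes import CategoricalNB

cat_model = CategoricalNB(min_categories=3)  # both features have 3 levels
cat_model.fit(X_train, y_train)
print("CategoricalNB accuracy:", round(accuracy_score(y_test, cat_model.predict(X_test)), 2))
print("CategoricalNB new predictions:",
      le_purchase.inverse_transform(cat_model.predict(new_data)).tolist())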
---------------------7th Program--------------------------
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Binarize the disease-progression target: values above 140 form the positive class
data = load_diabetes()
X, y = data.data, (data.target > 140).astype(int)

# Split first, then scale: fit the scaler on training data only to avoid leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def knn_predict(X_train, y_train, X_test, k, metric):
    preds = []
    for x in X_test:
        # Distance from the query point to every training point
        if metric == 'euclidean':
            dists = np.linalg.norm(X_train - x, axis=1)
        else:  # manhattan
            dists = np.sum(np.abs(X_train - x), axis=1)
        idx = np.argsort(dists)[:k]                # indices of the k nearest neighbours
        votes = y_train[idx]
        preds.append(np.bincount(votes).argmax())  # majority vote
    return np.array(preds)

# Try different K and metrics
for k in [3, 5, 7]:
    for metric in ['euclidean', 'manhattan']:
        y_pred = knn_predict(X_train, y_train, X_test, k, metric)
        acc = accuracy_score(y_test, y_pred)
        print(f"K={k}, Metric={metric}, Accuracy={acc:.2f}")

---------------------8th Program--------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --- Step 1: Load Dataset ---
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

# --- Step 2: Preprocessing ---
print("Null values in dataset:\n", df.isnull().sum())
print("\nDataset statistics:\n", df.describe())

# Standardize features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Reduce to 2D for visualization
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# --- Step 3: K-Means Clustering for K = 2, 3, 4 ---
k_values = [2, 3, 4]
results = {}

for k in k_values:
    # n_init set explicitly; its default changed across sklearn versions
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)

    # Save metrics
    results[k] = {
        "model": kmeans,
        "clusters": clusters,
        "inertia": kmeans.inertia_,
        "silhouette": silhouette_score(scaled_data, clusters)
    }

    # --- Step 4: Visualization ---
    plt.figure()
    plt.title(f"K-Means Clustering with K = {k}")
    plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis', s=50)
    centers = pca.transform(kmeans.cluster_centers_)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='x', s=200, label='Centroids')
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()
    plt.grid(True)
    plt.show()

# --- Step 5: Compare Cluster Performance ---
print("\n--- K-Means Clustering Performance Comparison ---")
comparison_df = pd.DataFrame({
    "K": k_values,
    "Inertia (SSE)": [results[k]["inertia"] for k in k_values],
    "Silhouette Score": [results[k]["silhouette"] for k in k_values]
})
print(comparison_df)

# --- Step 6: Inference / Analysis ---
print("\n--- Inference / Analysis ---")
print("1. Dataset: Iris (unsupervised; true labels not used).")
print("2. Data was preprocessed via standardization and reduced to 2D with PCA for visualization.")
print("3. K-Means was applied for K = 2, 3, 4.")
print("4. Inertia (SSE) decreased with higher K, indicating tighter clusters.")
print("5. Silhouette Score peaked at K = 2 or 3, suggesting optimal compactness and separation.")
print("6. PCA plots show distinct clusters, especially for K = 3, which aligns with true Iris species.")
print("7. K = 3 is likely optimal for this dataset based on visual and metric analysis.")
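Point 4 of the analysis above (inertia falling monotonically as K grows) is the basis of the elbow method for choosing K. The sketch below is an optional extension, not part of the original program; it reuses scaled_data, KMeans, and plt from the 8th Program and sweeps a wider range of K so the "elbow" can be read off the curve.

# Sketch (assumed extension): elbow curve over K = 1..10
inertias = []
k_range = range(1, 11)
for k in k_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_data)
    inertias.append(km.inertia_)
plt.plot(list(k_range), inertias, marker='o')
plt.xlabel("K")
plt.ylabel("Inertia (SSE)")
plt.title("Elbow Method for K Selection")
plt.grid(True)
plt.show()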
---------------------9th Program--------------------------
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.decomposition import PCA

# Load dataset
iris = load_iris()
X = iris.data
features = iris.feature_names

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to 2D for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# --- AGNES (Bottom-Up) ---
link_agnes = linkage(X_scaled, method='ward')  # 'ward' for minimum variance
plt.figure(figsize=(8, 4))
dendrogram(link_agnes, truncate_mode='lastp', p=20, leaf_rotation=45., leaf_font_size=10.)
plt.title("Dendrogram - AGNES (Bottom-Up)")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()

# Cut dendrogram to get 3 clusters
labels_agnes = fcluster(link_agnes, t=3, criterion='maxclust')

# Plot AGNES clusters
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_agnes, cmap='Set1')
plt.title("AGNES Clustering Output (3 Clusters)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.show()

# --- DIANA (Simulated Top-Down using complete linkage) ---
link_diana = linkage(X_scaled, method='complete')  # approximates divisive behavior
plt.figure(figsize=(8, 4))
dendrogram(link_diana, truncate_mode='lastp', p=20, leaf_rotation=45., leaf_font_size=10.)
plt.title("Dendrogram - DIANA (Top-Down Simulated)")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()

# Cut to get 3 clusters
labels_diana = fcluster(link_diana, t=3, criterion='maxclust')

# Plot DIANA clusters
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels_diana, cmap='Set2')
plt.title("DIANA Clustering Output (3 Clusters - Simulated)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.show()

# --- Inference ---
print("\n--- Inference / Analysis ---")
print("1. Dataset used: Iris (without labels for unsupervised clustering).")
print("2. AGNES (bottom-up) used 'ward' linkage to build clusters from individual points.")
print("3. DIANA (top-down) was simulated using 'complete' linkage (splits largest clusters first).")
print("4. Dendrograms visually show different clustering hierarchies.")
print("5. PCA-based scatter plots show that AGNES forms tighter, compact clusters.")
print("6. DIANA clusters are broader and may not align tightly with true classes.")
print("7. AGNES generally works better with compact, spherical clusters like those in Iris.")
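Points 5 and 6 of the analysis above claim that AGNES aligns with the true classes better than the complete-linkage stand-in for DIANA. Since the Iris labels are available, that claim can be quantified; the sketch below is an optional check, not part of the original program, reusing labels_agnes, labels_diana, and iris from the 9th Program and scoring each clustering with the adjusted Rand index (1.0 = perfect agreement, ~0 = random).

# Sketch (assumed extension): agreement with the true species labels
from sklearn.metrics import adjusted_rand_score

print("ARI (AGNES vs true labels):", round(adjusted_rand_score(iris.target, labels_agnes), 3))
print("ARI (DIANA vs true labels):", round(adjusted_rand_score(iris.target, labels_diana), 3))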
---------------------10th Program--------------------------
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, roc_curve)

# Load binary classification dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split first, then scale: fit the scaler on the training set only to avoid leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# Train and predict
ada.fit(X_train, y_train)
xgb.fit(X_train, y_train)
ada_pred = ada.predict(X_test)
xgb_pred = xgb.predict(X_test)
ada_prob = ada.predict_proba(X_test)[:, 1]
xgb_prob = xgb.predict_proba(X_test)[:, 1]

# Evaluation function
def evaluate_model(name, y_true, y_pred, y_prob):
    print(f"\n--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_true, y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted"); plt.ylabel("Actual")
    plt.show()

# Evaluate both models
evaluate_model("AdaBoost", y_test, ada_pred, ada_prob)
evaluate_model("XGBoost", y_test, xgb_pred, xgb_prob)

# Plot ROC Curves
fpr1, tpr1, _ = roc_curve(y_test, ada_prob)
fpr2, tpr2, _ = roc_curve(y_test, xgb_prob)
plt.plot(fpr1, tpr1, label=f'AdaBoost (AUC = {roc_auc_score(y_test, ada_prob):.3f})')
plt.plot(fpr2, tpr2, label=f'XGBoost (AUC = {roc_auc_score(y_test, xgb_prob):.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()
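A single 80/20 split gives only one point estimate per model, so the AdaBoost-vs-XGBoost comparison above can flip with a different random_state. The sketch below is an optional robustness check, not part of the original program: it runs 5-fold cross-validation on fresh copies of both classifiers, reusing X and y from the 10th Program. Scaling is skipped here on the assumption that tree-based boosters are insensitive to feature scale.

# Sketch (assumed extension): cross-validated ROC AUC for both boosters
from sklearn.model_selection import cross_val_score

for name, clf in [("AdaBoost", AdaBoostClassifier(n_estimators=100, random_state=42)),
                  ("XGBoost", XGBClassifier(eval_metric='logloss', random_state=42))]:
    scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
    print(f"{name}: mean ROC AUC = {scores.mean():.4f} (+/- {scores.std():.4f})")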