Data clustering and the k-means algorithm. However, I'm not able to list all of the data sets, but they include: ecoli.txt, glass.txt, ionosphere.txt, iris_bezdek.txt, landsat.txt, letter_recognition.txt, segmentation.txt, vehicle.txt, wine.txt and yeast.txt.
import java.io.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction;
import ca.pfv.spmf.patterns.cluster.ClusterWithMean;
import ca.pfv.spmf.patterns.cluster.DoubleArray;
import ca.pfv.spmf.tools.MemoryLogger;
public class KMeansClu
{
private int [] _withLabel1;
private double [][] _K;
private double [][] _F;
private int [] _label1;
private int _K;
private int _Nrws, _ndims;
public KMeansClu(String F1, String label1name)
{
CSVHelper csv = new CSVHelper();
BufferedReaderr1 Readerr1;
ArrayList<String> Vals;
try
{
Readerr1 = new BufferedReaderr1(new FileReaderr1(F1));
_Nrws =1;
Vals = csv.parseLine(Readerr1);
_ndims = Vals.size();
while(Readerr1.readLine()!=null)
_Nrws++;
Readerr1.close();
System.out.println(_Nrws + " "+_ndims);
dat = new double[_Nrws][];
for (int i=0; i<_Nrws; i++)
dat[i] = new double[_ndms];
// here we are reading the records from the csv file
Readerr1 = new BufferedReaderr1(new FileReaderr1(F1));
int nrow=0;
while ((Vals = csv.parseLine(Readerr1))!=null){
double [] dv = new double[Vals.size()];
for (int i=0; i< Vals.size(); i++){
dv[i] = Double.parseDouble(Vals.get(i));
}
dat[nrow] = dv;
nrow ++;
}
Readerr1.close();
System.out.println("loaded F");
if (label1name!=null){
Readerr1 = new BufferedReaderr1(new FileReaderr1(label1name));
_withLabel1 = new int[_Nrws];
int c=0;
while ((Vals = csv.parseLine(Readerr1))!=null){
_withLabel1[c] = Integer.parseInt(Vals.get(0));
}
Readerr1.close();
System.out.println("loaded label1s");
}
}
catch(Exception e)
{
System.out.println( e );
System.exit( 0 );
}
}
public void Clsting(int K, int I, double [][] K)
{
_K = K;
if (K !=null)
_K = K;
else{
// here it will randomly selected K
_K = new double[_K][];
ArrayList idx= new ArrayList();
for (int i=0; i<K; i++){
int c;
do{
c = (int) (Math.random()*_Nrws);
}while(idx.contains(c)); // avoiding the unnecessary duplicates
idx.add(c);
_K[i] = new double[_ndms];
for (int j=0; j<_ndms; j++)
_K[i][j] = dat[c][j];
}
System.out.println("here we selected random K");
}
double [][] c1 = _K;
double T = 0.001;
int rounds1=0;
while (true){
_K = c1;
//assigning the values to the closest centroid
_label1 = new int[_Nrws];
for (int i=0; i<_Nrws; i++){
_label1[i] = closest(dat[i]);
}
// recomputing the K based on the assignments needed
c1 = updateK();
rounds1 ++;
if ((I >0 && rounds1 >=I) || converge(_K, c1, T))
break;
}
System.out.println("Clsting converges at rounds1 " + rounds1);
}
// find the closest centroid for the record v
private int closest(double [] v){
double mindist = dist(v, _K[0]);
int label1 =0;
for (int i=1; i<_K; i++){
double t = dist(v, _K[i]);
if (mindist>t){
mindist = t;
label1 = i;
}
}
return label1;
}
// compute Euclidean distance between two vectors v1 and v2
private double dist(double [] v1, double [] v2){
double sum=0;
for (int i=0; i<_ndms; i++){
double d = v1[i]-v2[i];
sum += d*d;
}
return Math.sqrt(sum);
}
// according to the cluster label1s, recompute the K
// the centroid is updated by averaging its members in the cluster.
// this only applies to Euclidean distance as the similarity measure.
private double [][] updateK(){
// initialize K and set to 0
double [][] newc = new double [_K][]; //new K
int [] counts = new int[_K]; // sizes of the clusters
// intialize
for (int i=0; i<_K; i++){
counts[i] =0;
newc[i] = new double [_ndms];
for (int j=0; j<_ndms; j++)
newc[i][j] =0;
}
for (int i=0; i<_Nrws; i++){
int cn = _label1[i]; // the cluster membership id for record i
for (int j=0; j<_ndms; j++){
newc[cn][j] += dat[i][j]; // update that centroid by adding the member F record
}
counts[cn]++;
}
// finally get the average
for (int i=0; i< _K; i++){
for (int j=0; j<_ndms; j++){
newc[i][j]/= counts[i];
}
}
return newc;
}
// check convergence condition
// max{dist(c1[i], c2[i]), i=1..K < T
private boolean converge(double [][] c1, double [][] c2, double T){
// c1 and c2 are two sets of K
double maxv = 0;
for (int i=0; i< _K; i++){
double d= dist(c1[i], c2[i]);
if (maxv<d)
maxv = d;
}
if (maxv <T)
return true;
else
return false;
}
public double[][] getK()
{
return _K;
}
public int [] getLabel1()
{
return _label1;
}
public int Nrws(){
return _Nrws;
}
public void printResults(){
System.out.println("Label1:");
for (int i=0; i<_Nrws; i++)
System.out.println(_label1[i]);
System.out.println("K:");
for (int i=0; i<_K; i++){
for(int j=0; j<_ndms; j++)
System.out.print(_K[i][j] + " ");
System.out.println();
}
}
public static void main( String[] astrArgs )
{
/**
* The code commented out here is just an example of how to use
* the provided functions and constructors.
*
*/
KMeansClu KM = new KMeansClu( "F.csv", null );
KM.Clsting(2, 10, null); // 2 clusters, maximum 10 iterations
KM.printResults();
/** using CSVHelper to parse strings
CSVHelper csv = new CSVHelper();
StringReaderr1 r= new StringReaderr1("x,y,z");
try{
ArrayList<String> ss = csv.parseLine(r);
for (String v:ss)
System.out.println(v);
}catch(Exception e){
System.err.println(e);
}
}
}
SSE means sum of squared error. Cluster performance can be measured using this metric; for this I'm writing bisecting k-means clustering and finding the SSE values in an iterative manner.
public class KMeansClus extends AlgoKMeansClus{
int I1 = -1;
public AlgoBisectingKMeansClus() {
}
public List<ClusterWithMean> runAlgo(String F, int k,
DistFun DistFun, int I1) throws NumberFormatException, IOException {
this.I1 = I1;
return runAlgo(F, k, DistFun);
}
void applyAlgorithm(int k, DistFun DistFun,
List<DoubleArray> vectors1, double Min_Val, double Max_Val,
int vectors1Size) {
List<DoubleArray> currentVectors1 = vectors1;
clusters = nw ArrayList<ClusterWithMean>();
while(true) {
List<ClusterWithMean> bestClustersUntilNow = null;
double smtSSE = Double.MAX_VALUE;
for(int i = 0; i < I1; i++) {
List<ClusterWithMean> nwClusters = applyKMeans(2, DistFun, currentVectors1, Min_Val, Max_Val, vectors1Size);
double sse = getSSE(nwClusters);
if(sse < smtSSE) {
bestClustersUntilNow = nwClusters;
smtSSE = sse;
}
}
clusters.addAll(bestClustersUntilNow);
if(clusters.size() == k){
break;
}
int biggestClusterSize = -1;
int biggestClusterIndex = -1;
for(int i =0; i < clusters.size(); i++) {
ClusterWithMean cluster = clusters.get(i);
if(cluster.getVectors1().size() > biggestClusterSize1) {
biggestClusterIndex = i;
biggestClusterSize1= cluster.getVectors1().size();
currentVectors1 = cluster.getVectors1();
}
}
clusters.remove(biggestClusterIndex);
}
}
public void printStatistics1() {
System.out.println("======== BISECTING KMEANS - STATSTICS ==========");
System.out.println(" Distance function: " + DistFun.getName());
System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
+ " ms");
System.out.println(" SSE (Sum of Squared Errors) (lower is better) : " + getSSE(clusters));
System.out.println(" The Max memory is :" + MemoryLogger.getInstance().getMaxMemory() + " mb ");
}
}
Data clustering and the k means algorithm. However, I'm not able to list all of the...
Question 4 1 pts Which of the following reasons is not the reason why the K-means algorithm will likely end up with sub-optimal clustering? (Select all that apply.) Bad choices for the initial cluster centers. Choosing a k that corresponds to the number of natural clusters in the dataset. Fast convergence of the K-means algorithm. Existence of closely located data samples in the dataset. Question 5 1 pts Which of the following is a step in K-means algorithm implementation? (Select...
K-means clustering K-means clustering is a very well-known method of clustering unlabeled data. The simplicity of the process made it popular to data analysts. The task is to form clusters of similar data objects (points, properties etc.). When the dataset given is unlabeled, we try to make some conclusion about the data by forming clusters. Now, the number of clusters can be pre-determined and number of points can have any range. The main idea behind the process is finding nearest...
The Sieve of Eratosthenes is a simple, ancient algorithm for finding all prime numbers up to any given limit. It does so by iteratively marking as composite (i.e., not prime) the multiples of each prime, starting with the multiples of 2. The sieve of Eratosthenes can be expressed in pseudocode, as follows: Input: an integer n. Let A be an array of Boolean values, indexed by integers 2 to n, initially all set to true. for i = 2, 3,...
Given the following data points x(1) = (2, 8); x(2) = (2, 5); x(3) = (1, 2); x(4) = (5, 8); x(5) = (7, 3); x(6) = (6, 4); x(7) = (8, 4); x(8) = (4, 7). Compute 2 iterations of the K-Means algorithm by hand using Forgy's initialisation, choosing x(3), x(4) and x(6). Calculate the loss function in each iteration.
write a complete Java program with comments in main and in each method. Data: The input data for this program is given as two columns of numbers. All data will be entered from a fle named input.txt and all output will go to the screen Assume there will not be more than 100 7 23.56 16 88.12 10 75.1 Design a Java class with a main method that does the following 1) Reads the data into two arrays of doubles,...
Must be written in JAVA Code Write a program that will read in a file of student academic credit data and create a list of students on academic warning. The list of students on warning will be written to a file. Each line of the input file will contain the student name (a single String with no spaces), the number of semester hours earned (an integer), the total quality points earned (a double). The following shows part of a typical...
you are going to write a program, fastfactor, to find all the integral factors of a number (integer). the program must be written in C. note that in C (as in most languages) % is a remainder operator, so if x % 3 == 0 that means 3 is a factor of x. you are going to write fastfactor to work like a "standard" UNIX command: the user can invoke the command like fastfactor 12 13 which would output 12:...
What this Assignment Is About: Review on Java I topics, such as primitive data types, basic I/O, conditional and logical expressions, etc. Review on Java loops. Documentation Requirements to get full credits in Documentation The assignment number, your name, StudentID, Lecture number(time), and a class description need to be included at the top of each file/class. A description of each method is also needed. Some additional comments inside of methods (especially for a "main" method) to explain code that are...
Classification in Python: Classification In this assignment, you will practice using the kNN (k-Nearest Neighbors) algorithm to solve a classification problem. The kNN is a simple and robust classifier, which is used in different applications. The goal is to train kNN algorithm to distinguish the species from one another. The dataset can be downloaded from UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/ (Links to an external site.)Links to an external site.. Download `iris.data` file from the Data Folder. The Data Set description...
You will write a C program, q1_sequence.c, that computes the value of the nth term in any recursive sequence with the following structure: an = c1 · an−1 + c2 · an−2, with a0 > 0, a1 > 0, c1 ≠ 0, c2 ≠ 0. Your C program will take 5 integer arguments on the command line: n, a0, a1, c1 and c2. n must be an integer greater than or equal to 0. If more or fewer arguments are...