Segment violation (core generated)

0

I am applying the kmeans grouping algorithm and I am currently in the part of updating the centroids but I get an error of segment violation when I try to add the points of each centroid and divide it by the total number of points corresponding to each centroid , maybe if someone can help me to identify because that error is given, I would appreciate it.

This is the code, the error is presented to me in a function newCentroids.

    using namespace std;
    using SPoint = vector<pair<size_t, double>>;
    using Rates = map<pair<string, string>, double>; // (user,movie) -> rate


    Rates readNetflix(const string& fname) {
      ifstream input(fname);
      string line;
      size_t lines = 0;

      Rates rates;

      string currMovie;

      while (getline(input, line)) {
        if (line.back() == ':') {
          line.pop_back();
          currMovie = line;
          cout << "Movie: " << currMovie << endl;
        } else {
          size_t endUser = line.find_first_of(",");
          string currUser = line.substr(0, endUser);
          line.erase(0, endUser + 1);
          size_t endRate = line.find_first_of(",");
          string currRate = line.substr(0, endRate);
          rates[{currUser, currMovie}] = stoi(currRate);
        }
        lines++;
      }
      return rates;
    }


    vector<SPoint> createPoints(const Rates& rates) {
      map<string, size_t> normUsers;
      map<string, size_t> normMovies;
      for (const auto& e : rates) {
        const auto& user = e.first.first;
        const auto& movie = e.first.second;

        if (normUsers.count(user) == 0)
          normUsers[user] = normUsers.size();

        if (normMovies.count(movie) == 0)
          normMovies[movie] = normMovies.size();
      }
      cout << "End of normalization " << normUsers.size() << " "
           << normMovies.size() << endl;

      vector<SPoint> users(normUsers.size(),SPoint());
      for (const auto& e : rates) {
        size_t user = normUsers[e.first.first];
        size_t movie = normMovies[e.first.second];
        double rate = e.second;
        pair<size_t,double> p= make_pair(movie, rate);
        users[user].push_back(p);
      }
      return users;
    }



    double angle( const SPoint& p, const SPoint& q) {

      double p_punto=0;
      double norma_A=0;
      double norma_B=0;
      #pragma omp parallel for
      for (size_t i = 0; i < p.size(); i++) {
        p_punto=(p[i].first*q[i].first)+(p[i].second*q[i].second);
        norma_A=(sqrt(pow(p[i].first,2)+pow(p[i].second,2)));
        norma_B=(sqrt(pow(q[i].first,2)+pow(q[i].second,2)));
      }

      double A_Punto_B=norma_A*norma_B;
      double division=0;
        if (A_Punto_B != 0) {
           division=p_punto/A_Punto_B;
         }

      double arc_cos=acos(division);
      double rad_grados=(arc_cos*180)/3.1415926535;

      return rad_grados;
    }

    void printClustering(const vector<SPoint>& dataset,
                         const vector<size_t>& clustering, size_t k) {

      size_t n = dataset.size();
      vector<size_t> count(k, 0);
      for (size_t i = 0; i < n; i++) {
        size_t ci = clustering[i];
        count[ci]++;
      }
      for(size_t i = 0; i < k; i++) {
        cout << " cluster " << i << ": " << count[i] << endl;
      }
    }


    vector<SPoint> randomPoints(size_t k, const vector<SPoint>& ds) {
      random_device rd;
      mt19937 generator(rd());
      uniform_int_distribution<> unif(0, ds.size()-1);
      size_t dim = ds[0].size();
      vector<SPoint>  c(k,SPoint());
      //#pragma omp parallel
      //{
       //#pragma omp parallel for
       for (size_t i = 0; i < k; i++) {
         size_t r = unif(rd);
         pair<size_t,double> p= make_pair(r, r);
         c[i].push_back(p);
       }
      //}
      return c;
    }

    tuple<size_t, double> closestCentroid(const SPoint& p,
                                          const vector<SPoint>& centroids) {
      double d = numeric_limits<double>::max();
      size_t c = 0;
      //#pragma omp parallel
      //{
       //#pragma omp parallel for
       for (size_t i = 0; i < centroids.size(); i++) {
         double dt = angle(p, centroids[i]);
         if (dt < d) {
           d = dt;
           c = i;
         }
       }
      //}

      return make_tuple(c,d);
    }

    pair<vector<size_t>, double> cluster(const vector<SPoint>& dataset,
                                         const vector<SPoint>& centroids) {
      size_t n = dataset.size();
      vector<size_t> clustering(n, 0);
      double ssd = 0.0;
      for (size_t i = 0; i < n; i++) {
        size_t c;
        double d;
        tie(c, d) = closestCentroid(dataset[i], centroids);
        clustering[i] = c;
        ssd += d;
      }
      return {clustering, ssd};
    }

    vector<SPoint> newCentroids(const vector<size_t>& clustering,
                                const vector<SPoint>& dataset,
                                vector<SPoint>& centroids) {
      size_t k = centroids.size();
      size_t dim = centroids[0].size();
      vector<SPoint>  newCentroids(k,SPoint());
      vector<size_t> count(k, 0.0);

      for (size_t i = 0; i < dataset.size(); i++) {
        size_t ci = clustering[i];
        cout <<"ci: " << ci<<endl;
        for (size_t j = 0; j < dim; j++) {
          newCentroids[ci][j].first += dataset[i][j].first;
          newCentroids[ci][j].second += dataset[i][j].second;
        }
        count[ci]++;
      }



      //#pragma omp parallel
      for (size_t i = 0; i < k; i++) {
        for (size_t j = 0; j < dim; j++) {
            newCentroids[i][j].first /= count[i];
        }
      }
      return newCentroids;
      }
    //}

    vector<size_t> kmeans(const vector<SPoint>& dataset, size_t k, double epsilon,
                          size_t maxIter) {
      size_t dim = dataset[0].size();
      size_t n = dataset.size();
      vector<SPoint> centroids = randomPoints(k, dataset);
      vector<size_t> clustering(n, 0);
      double ssd = 0.0;
      double ssdPrev = 0.0;
      double d;
      size_t iter = 0;
      do{
        ssdPrev = ssd;
        cout << "Iteration " << iter << endl;
        tie(clustering, ssd) = cluster(dataset, centroids);
        cout << "SSD: " << ssd << endl;
        centroids = newCentroids(clustering,dataset,centroids);
        iter++;
        d = abs(ssdPrev - ssd);
        cout << "----> " << d << endl;

      }while(d>epsilon);

      return clustering;
    }


    int main(int argc, char** argv) {
      if (argc != 2)
        return -1;
      string fname(argv[1]);
      Rates rates = readNetflix(fname);
      vector<SPoint> ds = createPoints(rates);
      vector<size_t> clustering(21462,0);
      clustering = kmeans(ds, 3, 0.001, 8);
      printClustering(ds, clustering, 3);

      return 0;
    }
    
asked by Camilo Villegas 07.05.2018 в 03:00
source

1 answer

1
for (size_t j = 0; j < dim; j++) {
      newCentroids[ci][j].first += dataset[i][j].first;
      newCentroids[ci][j].second += dataset[i][j].second;
    }

Before accessing the elements you have to make sure that the object "dataset [i] [j]" is valid and is not nullptr, in case it has been deleted at some point or it is invalid to try to dereference it causes a segmetation fault

    
answered by 10.05.2018 в 13:00