Load Caffe framework models

Next Tutorial: How to enable Halide backend for improve efficiency

Original author Vitaliy Lyudvichenko
Compatibility OpenCV >= 3.3

Introduction

In this tutorial you will learn how to use the opencv_dnn module for image classification by using the GoogLeNet trained network from the Caffe model zoo.

We will demonstrate the results of this example on the following picture.

Source Code

We will be using snippets from the example application, which can be downloaded here.

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#include "common.hpp"
std::string keys =
    "{ help h | | Print help message. }"
    "{ @alias | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo | models.yml | An optional path to file with preprocessing parameters }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ initial_width | 0 | Preprocess input image by initial resizing to a specific width.}"
    "{ initial_height | 0 | Preprocess input image by initial resizing to a specific height.}"
    "{ std | 0.0 0.0 0.0 | Preprocess input image by dividing on a standard deviation.}"
    "{ crop | false | Preprocess input image by center cropping.}"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
    "{ needSoftmax | false | Use Softmax to post-process the output of the net.}"
    "{ classes | | Optional path to a text file with names of classes. }"
    "{ backend | 0 | Choose one of computation backends: "
        "0: automatically (by default), "
        "1: Halide language (http://halide-lang.org/), "
        "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
        "3: OpenCV implementation, "
        "4: VKCOM, "
        "5: CUDA, "
        "6: WebNN }"
    "{ target | 0 | Choose one of target computation devices: "
        "0: CPU target (by default), "
        "1: OpenCL, "
        "2: OpenCL fp16 (half-float precision), "
        "3: VPU, "
        "4: Vulkan, "
        "6: CUDA, "
        "7: CUDA fp16 (half-float preprocess) }";
using namespace cv;
using namespace dnn;
std::vector<std::string> classes;
int main(int argc, char** argv)
{
    CommandLineParser parser(argc, argv, keys);

    const std::string modelName = parser.get<String>("@alias");
    const std::string zooFile = parser.get<String>("zoo");

    keys += genPreprocArguments(modelName, zooFile);

    parser = CommandLineParser(argc, argv, keys);
    parser.about("Use this script to run classification deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }

    int rszWidth = parser.get<int>("initial_width");
    int rszHeight = parser.get<int>("initial_height");
    float scale = parser.get<float>("scale");
    Scalar mean = parser.get<Scalar>("mean");
    Scalar std = parser.get<Scalar>("std");
    bool swapRB = parser.get<bool>("rgb");
    bool crop = parser.get<bool>("crop");
    int inpWidth = parser.get<int>("width");
    int inpHeight = parser.get<int>("height");
    String model = findFile(parser.get<String>("model"));
    String config = findFile(parser.get<String>("config"));
    String framework = parser.get<String>("framework");
    int backendId = parser.get<int>("backend");
    int targetId = parser.get<int>("target");
    bool needSoftmax = parser.get<bool>("needSoftmax");
    std::cout << "mean: " << mean << std::endl;
    std::cout << "std: " << std << std::endl;

    // Open file with classes names.
    if (parser.has("classes"))
    {
        std::string file = parser.get<String>("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))
        {
            classes.push_back(line);
        }
    }

    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }
    CV_Assert(!model.empty());

    Net net = readNet(model, config, framework);
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);
    // Create a window
    static const std::string kWinName = "Deep learning image classification in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);
    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            break;
        }

        if (rszWidth != 0 && rszHeight != 0)
        {
            resize(frame, frame, Size(rszWidth, rszHeight));
        }

        blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);

        // Check std values.
        if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
        {
            // Divide blob by std.
            divide(blob, std, blob);
        }

        net.setInput(blob);

        // double t_sum = 0.0;
        // double t;
        int classId;
        double confidence;
        cv::TickMeter timeRecorder;
        timeRecorder.reset();
        Mat prob = net.forward();
        double t1;
        timeRecorder.start();
        prob = net.forward();
        timeRecorder.stop();
        t1 = timeRecorder.getTimeMilli();

        timeRecorder.reset();
        for (int i = 0; i < 200; i++) {
            timeRecorder.start();
            prob = net.forward();
            timeRecorder.stop();

            Point classIdPoint;
            minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
            classId = classIdPoint.x;

            // Put efficiency information.
            // std::vector<double> layersTimes;
            // double freq = getTickFrequency() / 1000;
            // t = net.getPerfProfile(layersTimes) / freq;
            // t_sum += t;
        }

        if (needSoftmax == true)
        {
            float maxProb = 0.0;
            float sum = 0.0;
            Mat softmaxProb;

            maxProb = *std::max_element(prob.begin<float>(), prob.end<float>());
            cv::exp(prob - maxProb, softmaxProb);
            sum = (float)cv::sum(softmaxProb)[0];
            softmaxProb /= sum;

            Point classIdPoint;
            minMaxLoc(softmaxProb.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
            classId = classIdPoint.x;
        }

        std::string label = format("Inference time of 1 round: %.2f ms", t1);
        std::string label2 = format("Average time of 200 rounds: %.2f ms", timeRecorder.getTimeMilli() / 200);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        putText(frame, label2, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        // Print predicted class.
        label = format("%s: %.4f", (classes.empty() ? format("Class #%d", classId).c_str() :
                                                      classes[classId].c_str()),
                       confidence);
        putText(frame, label, Point(0, 55), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
    }
    return 0;
}

Explanation

  1. Firstly, download the GoogLeNet model files: bvlc_googlenet.prototxt and bvlc_googlenet.caffemodel

    You also need the file with the names of the ILSVRC2012 classes: classification_classes_ILSVRC2012.txt.

    Put these files into the working directory of this program example.

  2. Read and initialize the network using the paths to the .prototxt and .caffemodel files

    Net net = readNet(model, config, framework);
    net.setPreferableBackend(backendId);
    net.setPreferableTarget(targetId);

    You can skip the framework argument if one of the model or config files has a .caffemodel or .prototxt extension. In that case the cv::dnn::readNet function detects the model's format automatically, as in the sketch below.
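
    For illustration only (a minimal sketch, not part of the sample), readNet can then be called with just the two file paths:

    // readNet() infers the Caffe framework from the .caffemodel / .prototxt extensions.
    Net net = readNet("bvlc_googlenet.caffemodel", "bvlc_googlenet.prototxt");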

  3. Read the input image and convert it to a blob acceptable by GoogLeNet

    VideoCapture cap;
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
        cap.open(0);

    cv::VideoCapture can load both images and videos.

    blobFromImage(frame, blob, scale, Size(inpWidth, inpHeight), mean, swapRB, crop);
    // Check std values.
    if (std.val[0] != 0.0 && std.val[1] != 0.0 && std.val[2] != 0.0)
    {
        // Divide blob by std.
        divide(blob, std, blob);
    }

    Using the cv::dnn::blobFromImage function we convert the image to a 4-dimensional blob (a so-called batch) with shape 1x3x224x224, after applying the necessary pre-processing: resizing and subtraction of the mean values (104, 117, 123) from the blue, green and red channels respectively. An equivalent explicit call is sketched below.
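
    For reference (a minimal sketch, not part of the sample), the same blob could be built with an explicit call using the GoogLeNet parameters from this tutorial:

    // 224x224 input, mean (104, 117, 123) subtracted in BGR order, no scaling, no center crop.
    Mat blob = blobFromImage(frame, 1.0, Size(224, 224), Scalar(104, 117, 123), false, false);
    // The resulting blob has NCHW shape 1x3x224x224 and type CV_32F.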

  4. Pass the blob to the network
    net.setInput(blob);
  5. Make forward pass
    // double t_sum = 0.0;
    // double t;
    int classId;
    double confidence;
    cv::TickMeter timeRecorder;
    timeRecorder.reset();
    Mat prob = net.forward();
    double t1;
    timeRecorder.start();
    prob = net.forward();
    timeRecorder.stop();
    t1 = timeRecorder.getTimeMilli();
    timeRecorder.reset();
    for (int i = 0; i < 200; i++) {
        timeRecorder.start();
        prob = net.forward();
        timeRecorder.stop();
    }

    During the forward pass the output of each network layer is computed, but in this example we need only the output of the last layer.
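
    If you need the result of a particular layer instead, cv::dnn::Net::forward also accepts an output blob name (a hedged sketch, not part of the sample; "prob" is the name of the final softmax layer in bvlc_googlenet.prototxt):

    // Request the output blob of a named layer; forward() without arguments
    // returns the blob of the default (last) output layer.
    Mat probByName = net.forward("prob");
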
  6. Determine the best class
    Point classIdPoint;
    minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
    classId = classIdPoint.x;
    We put the output of the network, which contains the probabilities for each of the 1000 ILSVRC2012 image classes, into the prob blob and find the index of the element with the maximal value in it. This index corresponds to the class of the image.
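
    If you want more than the single best class, the same prob blob can also be sorted. A hedged sketch (not part of the sample) that prints the top-5 class ids with cv::sortIdx:

    // Sort the 1x1000 score row in descending order and print the five best class ids.
    Mat scores = prob.reshape(1, 1);
    Mat order;
    sortIdx(scores, order, SORT_EVERY_ROW + SORT_DESCENDING);
    for (int k = 0; k < 5; ++k)
    {
        int id = order.at<int>(0, k);
        std::cout << "class " << id << ": " << scores.at<float>(0, id) << std::endl;
    }
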
  7. Run an example from command line
    ./example_dnn_classification --model=bvlc_googlenet.caffemodel --config=bvlc_googlenet.prototxt --width=224 --height=224 --classes=classification_classes_ILSVRC2012.txt --input=space_shuttle.jpg --mean="104 117 123"
    For our image we get a prediction of the class "space shuttle" with more than 99% confidence.
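
    Alternatively, the sample can read the preprocessing parameters (scale, mean, input size, etc.) from a models.yml file via the @alias and zoo options defined in keys. A hedged example, assuming your models.yml contains an entry named googlenet (check the file for the exact alias name):

    ./example_dnn_classification googlenet --zoo=models.yml --input=space_shuttle.jpg --classes=classification_classes_ILSVRC2012.txt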